Import Important and relevant libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

Load Dataset

In [22]:
# Load dataset with correct delimiter
df = pd.read_csv("student_data_with_dropout.csv")

In [23]:
df.head()

Unnamed: 0,Student_ID,Attendance_Percent,Fees_Status,Annual_Marks,Dropout_Likelihood
0,S001,53.09,Up-to-date,217,Dropout Likely
1,S002,66.02,Pending,290,Dropout Likely
2,S003,90.0,Pending,386,Likely to Enroll
3,S004,93.92,Pending,510,Likely to Enroll
4,S005,60.0,Up-to-date,460,At Risk


features selection

In [33]:
selected_features = [
    "Attendance_Percent",	
    "Fees_Status",
	"Annual_Marks",
]


Data Preprocessing

In [None]:
# Encode target column
le = LabelEncoder()

df['Fees_Status'] = le.fit_transform(df['Fees_Status'])

df['Dropout_Likelihood'] = le.fit_transform(df['Dropout_Likelihood'])


Seperating Dependent and Independent data

In [35]:
X = df[selected_features]
y = df['Dropout_Likelihood']

In [36]:
# One-hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

Train Test spliting

In [37]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Model Training

In [38]:
# Train Random Forest
clf = RandomForestClassifier(n_estimators=110, random_state=42, class_weight="balanced")
clf.fit(X_train, y_train)

Model Evaluation

In [39]:
X_train_pred = clf.predict(X_train)
X_test_pred = clf.predict(X_test)
print("Train Classification Report:\n", classification_report(y_train, X_train_pred))
print("Test Classification Report:\n", classification_report(y_test, X_test_pred))
print("Train Confusion Matrix:\n", confusion_matrix(y_train, X_train_pred))
print("Test Confusion Matrix:\n", confusion_matrix(y_test, X_test_pred))

Train Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        29

    accuracy                           1.00        80
   macro avg       1.00      1.00      1.00        80
weighted avg       1.00      1.00      1.00        80

Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         7

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

Train Confusion Matrix:
 [[33  0  0]
 [ 0 18  0]
 [ 0  0 29]]
Test Confusion Matrix:
 [[9 0 0]
 [0 4 0]
 [0 0 7]]


In [62]:
def predict_dropout(student_data, model, le, feature_columns):
    """
    student_data: dict with student info
    model: trained ML model
    le: LabelEncoder fitted on target
    feature_columns: list of columns used in training
    """
    # Convert input dict → DataFrame
    X_new = pd.DataFrame([student_data])

    # One-hot encode categorical values
    X_new = pd.get_dummies(X_new)

    # Re-align with training columns
    X_new = X_new.reindex(columns=feature_columns, fill_value=0)

    # Predict class
    y_pred = model.predict(X_new)
    y_label = le.inverse_transform(y_pred)[0]

    return y_label

# -------------------
# Example student input
# -------------------
student = {
    "Attendance_Percent": 0,     
    "Fees_Status": "Pending",    
    "Annual_Marks": 0       
}

# -------------------
# Test prediction
# -------------------
result = predict_dropout(student, clf, le, X_train.columns)
print("Prediction:", result)

Prediction: 1
