In [48]:
import pandas as pd
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [49]:
df = pd.read_csv("student_igcse_dataset.csv")
df

Unnamed: 0,student_id,entry_grade,age_at_entry,gender,parent_education,family_income_bracket,language_proficiency,attention_span_score,social_skills_score,cognitive_test_score,has_learning_difficulty,parental_involvement_score,high_IGCSE_performance
0,S1000,3rd,3.3,Male,Master's,Middle,Medium,2,4,85,0,1,1
1,S1001,UKG,7.8,Female,Bachelor's,Middle,Low,4,4,110,0,3,1
2,S1002,UKG,4.8,Male,Master's,Middle,Low,1,2,123,0,3,1
3,S1003,2nd,3.2,Female,Master's,Middle,Medium,4,2,99,0,5,1
4,S1004,UKG,7.0,Female,High School,Middle,Medium,4,3,73,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,S1495,9th,4.5,Male,High School,High,Low,3,5,114,0,5,1
496,S1496,UKG,4.5,Male,Master's,Middle,Medium,3,5,130,1,2,1
497,S1497,6th,4.2,Female,Bachelor's,Low,High,5,3,75,0,4,1
498,S1498,9th,3.9,Female,PhD,Middle,High,3,1,93,0,1,1


In [81]:
label_cols = ['student_id', 'entry_grade', 'parent_education', 'family_income_bracket', 'language_proficiency']
le = LabelEncoder()

for col in label_cols:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

In [83]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df

Unnamed: 0,student_id,entry_grade,age_at_entry,parent_education,family_income_bracket,language_proficiency,attention_span_score,social_skills_score,cognitive_test_score,has_learning_difficulty,parental_involvement_score,high_IGCSE_performance,gender_Male
0,0,3,3.3,2,2,2,2,4,85,0,1,1,1
1,1,11,7.8,0,2,1,4,4,110,0,3,1,0
2,2,11,4.8,2,2,1,1,2,123,0,3,1,1
3,3,2,3.2,2,2,2,4,2,99,0,5,1,0
4,4,11,7.0,1,2,2,4,3,73,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,9,4.5,1,0,1,3,5,114,0,5,1,1
496,496,11,4.5,2,2,2,3,5,130,1,2,1,1
497,497,6,4.2,0,1,0,5,3,75,0,4,1,0
498,498,9,3.9,3,2,0,3,1,93,0,1,1,0


In [74]:
if 'high_IGCSE_performance' not in df.columns:
    raise KeyError("Target column 'high_IGCSE_performance' not found.")
X = df.drop(columns=["high_IGCSE_performance"])
y = df["high_IGCSE_performance"]

In [75]:
feature_columns = X.columns.tolist()
with open("feature_columns.pkl", "wb") as f:
    pickle.dump(feature_columns, f)

In [76]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [55]:
selector = SelectKBest(score_func=f_classif, k=6)
X_selected = selector.fit_transform(X_balanced, y_balanced)
selected_features = [feature_columns[i] for i, keep in enumerate(selector.get_support()) if keep]
print("Selected features (k=6):", selected_features)

Selected features (k=6): ['parent_education', 'language_proficiency', 'attention_span_score', 'social_skills_score', 'cognitive_test_score', 'parental_involvement_score']


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_balanced, test_size=0.2, random_state=42)

# Define classifiers
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM Linear": SVC(kernel='linear', probability=True),
    "SVM Non-Linear": SVC(kernel='rbf', probability=True),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

In [57]:
accuracy_scores = []
trained_models = []
reports = []

In [58]:
for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    accuracy_scores.append(acc)
    trained_models.append(clf)
    reports.append(report)
    print(f"\n=== {name} ===")
    print(report)


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        98
           1       0.99      0.97      0.98        91

    accuracy                           0.98       189
   macro avg       0.98      0.98      0.98       189
weighted avg       0.98      0.98      0.98       189


=== SVM Linear ===
              precision    recall  f1-score   support

           0       0.97      0.98      0.97        98
           1       0.98      0.97      0.97        91

    accuracy                           0.97       189
   macro avg       0.97      0.97      0.97       189
weighted avg       0.97      0.97      0.97       189


=== SVM Non-Linear ===
              precision    recall  f1-score   support

           0       0.82      0.96      0.89        98
           1       0.95      0.78      0.86        91

    accuracy                           0.87       189
   macro avg       0.89      0.87      0.87       189
w

In [59]:
best_model_index = np.argmax(accuracy_scores)
best_model_name = list(models.keys())[best_model_index]
best_model = trained_models[best_model_index]
best_report = reports[best_model_index]
print(f"\nBest model by accuracy: {best_model_name}")


Best model by accuracy: Logistic Regression


In [61]:
if hasattr(best_model, "predict_proba"):
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_proba)
    print("AUC Score:", auc_score)
else:
    print(f"{best_model_name} does not support predict_proba; skipping AUC.")

print("\nClassification Report for Best Model:")
print(best_report)

AUC Score: 0.99943933617403

Classification Report for Best Model:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        98
           1       0.99      0.97      0.98        91

    accuracy                           0.98       189
   macro avg       0.98      0.98      0.98       189
weighted avg       0.98      0.98      0.98       189



In [62]:
with open("feature_selector.pkl", "wb") as f:
    pickle.dump(selector, f)

with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

In [63]:
print("Saved: feature_selector.pkl, best_model.pkl, feature_columns.pkl")

Saved: feature_selector.pkl, best_model.pkl, feature_columns.pkl
