In [33]:
import pandas as pd
import numpy as np
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [34]:
df = pd.read_csv("student_igcse_dataset.csv")
df

Unnamed: 0,student_id,entry_grade,age_at_entry,gender,parent_education,family_income_bracket,language_proficiency,attention_span_score,social_skills_score,cognitive_test_score,has_learning_difficulty,parental_involvement_score,high_IGCSE_performance
0,S1000,3rd,3.3,Male,Master's,Middle,Medium,2,4,85,0,1,1
1,S1001,UKG,7.8,Female,Bachelor's,Middle,Low,4,4,110,0,3,1
2,S1002,UKG,4.8,Male,Master's,Middle,Low,1,2,123,0,3,1
3,S1003,2nd,3.2,Female,Master's,Middle,Medium,4,2,99,0,5,1
4,S1004,UKG,7.0,Female,High School,Middle,Medium,4,3,73,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,S1495,9th,4.5,Male,High School,High,Low,3,5,114,0,5,1
496,S1496,UKG,4.5,Male,Master's,Middle,Medium,3,5,130,1,2,1
497,S1497,6th,4.2,Female,Bachelor's,Low,High,5,3,75,0,4,1
498,S1498,9th,3.9,Female,PhD,Middle,High,3,1,93,0,1,1


In [35]:
le = LabelEncoder()
df['student_id'] = le.fit_transform(df['student_id'])

In [36]:
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col != 'student_id']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [37]:
print(df.head())

   student_id  age_at_entry  attention_span_score  social_skills_score  \
0           0           3.3                     2                    4   
1           1           7.8                     4                    4   
2           2           4.8                     1                    2   
3           3           3.2                     4                    2   
4           4           7.0                     4                    3   

   cognitive_test_score  has_learning_difficulty  parental_involvement_score  \
0                    85                        0                           1   
1                   110                        0                           3   
2                   123                        0                           3   
3                    99                        0                           5   
4                    73                        0                           1   

   high_IGCSE_performance  entry_grade_1st  entry_grade_2nd  ...  \
0     

In [38]:
X = df.drop(columns=["high_IGCSE_performance"])
y = df["high_IGCSE_performance"]

In [39]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [40]:
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_balanced, y_balanced)
selected_features = X.columns[selector.get_support()]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_balanced, test_size=0.2, random_state=42)

# Define classifiers
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM Linear": SVC(kernel='linear', probability=True),
    "SVM Non-Linear": SVC(kernel='rbf', probability=True),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}


In [42]:
accuracy_scores = []
trained_models = []
reports = []

In [43]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    accuracy_scores.append(acc)
    trained_models.append(model)
    reports.append(report)
    print(f"\n=== {name} ===")
    print(report)


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        98
           1       0.99      0.97      0.98        91

    accuracy                           0.98       189
   macro avg       0.98      0.98      0.98       189
weighted avg       0.98      0.98      0.98       189


=== SVM Linear ===
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        98
           1       0.98      0.99      0.98        91

    accuracy                           0.98       189
   macro avg       0.98      0.98      0.98       189
weighted avg       0.98      0.98      0.98       189


=== SVM Non-Linear ===
              precision    recall  f1-score   support

           0       0.79      0.91      0.85        98
           1       0.88      0.75      0.81        91

    accuracy                           0.83       189
   macro avg       0.84      0.83      0.83       189
w

In [48]:
best_model_index = np.argmax(accuracy_scores)
best_model_name = list(models.keys())[best_model_index]
best_model = trained_models[best_model_index]
best_report = reports[best_model_index]

In [46]:
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC Score:", auc_score)

AUC Score: 0.999663601704418


In [50]:
print("Classification Report:")
print("\nBest Model:", best_model_name)
print(best_report)

Classification Report:

Best Model: SVM Linear
              precision    recall  f1-score   support

           0       0.99      0.98      0.98        98
           1       0.98      0.99      0.98        91

    accuracy                           0.98       189
   macro avg       0.98      0.98      0.98       189
weighted avg       0.98      0.98      0.98       189



In [51]:
with open("best_model.pkl", "wb") as file:
    pickle.dump(best_model, file)
print("Best model saved as 'best_model.pkl'")

Best model saved as 'best_model.pkl'
