In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import GridSearchCV

In [5]:
dataset = pd.read_csv("student_igcse_dataset.csv")
dataset

Unnamed: 0,student_id,entry_grade,age_at_entry,gender,parent_education,family_income_bracket,language_proficiency,attention_span_score,social_skills_score,cognitive_test_score,has_learning_difficulty,parental_involvement_score,high_IGCSE_performance
0,S1000,3rd,3.3,Male,Master's,Middle,Medium,2,4,85,0,1,1
1,S1001,UKG,7.8,Female,Bachelor's,Middle,Low,4,4,110,0,3,1
2,S1002,UKG,4.8,Male,Master's,Middle,Low,1,2,123,0,3,1
3,S1003,2nd,3.2,Female,Master's,Middle,Medium,4,2,99,0,5,1
4,S1004,UKG,7.0,Female,High School,Middle,Medium,4,3,73,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,S1495,9th,4.5,Male,High School,High,Low,3,5,114,0,5,1
496,S1496,UKG,4.5,Male,Master's,Middle,Medium,3,5,130,1,2,1
497,S1497,6th,4.2,Female,Bachelor's,Low,High,5,3,75,0,4,1
498,S1498,9th,3.9,Female,PhD,Middle,High,3,1,93,0,1,1


In [6]:
dataset = pd.get_dummies(dataset, drop_first=True)
dataset

Unnamed: 0,age_at_entry,attention_span_score,social_skills_score,cognitive_test_score,has_learning_difficulty,parental_involvement_score,high_IGCSE_performance,student_id_S1001,student_id_S1002,student_id_S1003,...,entry_grade_LKG,entry_grade_UKG,gender_Male,parent_education_High School,parent_education_Master's,parent_education_PhD,family_income_bracket_Low,family_income_bracket_Middle,language_proficiency_Low,language_proficiency_Medium
0,3.3,2,4,85,0,1,1,0,0,0,...,0,0,1,0,1,0,0,1,0,1
1,7.8,4,4,110,0,3,1,1,0,0,...,0,1,0,0,0,0,0,1,1,0
2,4.8,1,2,123,0,3,1,0,1,0,...,0,1,1,0,1,0,0,1,1,0
3,3.2,4,2,99,0,5,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1
4,7.0,4,3,73,0,1,1,0,0,0,...,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,4.5,3,5,114,0,5,1,0,0,0,...,0,0,1,1,0,0,0,0,1,0
496,4.5,3,5,130,1,2,1,0,0,0,...,0,1,1,0,1,0,0,1,0,1
497,4.2,5,3,75,0,4,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
498,3.9,3,1,93,0,1,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [7]:
dataset.columns

Index(['age_at_entry', 'attention_span_score', 'social_skills_score',
       'cognitive_test_score', 'has_learning_difficulty',
       'parental_involvement_score', 'high_IGCSE_performance',
       'student_id_S1001', 'student_id_S1002', 'student_id_S1003',
       ...
       'entry_grade_LKG', 'entry_grade_UKG', 'gender_Male',
       'parent_education_High School', 'parent_education_Master's',
       'parent_education_PhD', 'family_income_bracket_Low',
       'family_income_bracket_Middle', 'language_proficiency_Low',
       'language_proficiency_Medium'],
      dtype='object', length=525)

In [8]:
dataset.isnull().sum()

age_at_entry                    0
attention_span_score            0
social_skills_score             0
cognitive_test_score            0
has_learning_difficulty         0
                               ..
parent_education_PhD            0
family_income_bracket_Low       0
family_income_bracket_Middle    0
language_proficiency_Low        0
language_proficiency_Medium     0
Length: 525, dtype: int64

In [9]:
indep_X = dataset.drop('high_IGCSE_performance', axis=1)
dep_Y = dataset['high_IGCSE_performance']

In [10]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [11]:
def forwardFeature(indep_X, dep_Y, k):
    forwardlist = []

    log_model = LogisticRegression(solver='lbfgs', max_iter=1000)
    svc_model = SVC(kernel='linear')
    rf = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    dt = DecisionTreeClassifier(criterion='entropy', random_state=0)

    models = [log_model, svc_model, rf, dt]

    for model in models:
        print(f"Running Forward Selection for: {model.__class__.__name__}")
        sfs = SFS(model,
                  k_features=k,
                  forward=True,
                  floating=False,
                  scoring='accuracy',
                  cv=5,
                  n_jobs=1)  # safer than using all CPUs
        sfs = sfs.fit(indep_X, dep_Y)
        selected_features = list(sfs.k_feature_idx_)
        forward_features = indep_X.iloc[:, selected_features]
        forwardlist.append(forward_features)

    return forwardlist

In [12]:
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

def cm_prediction(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, cm

def logistic(X_train, y_train, X_test, y_test):
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def svm_linear(X_train, y_train, X_test, y_test):
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def svm_NL(X_train, y_train, X_test, y_test):
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def knn(X_train, y_train, X_test, y_test):
    classifier = KNeighborsClassifier(n_neighbors=5)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def Navie(X_train, y_train, X_test, y_test):
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def Decision(X_train, y_train, X_test, y_test):
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def random(X_train, y_train, X_test, y_test):
    classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    return cm_prediction(classifier, X_test, y_test)

def forward_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    dataFrame = pd.DataFrame(index=['ForwardSelector'], columns=[
        'Logistic', 'SVM1', 'SVMn1', 'KNN', 'Navie', 'Decision', 'Random'
    ])
    
    for number, idex in enumerate(dataFrame.index):
        dataFrame['Logistic'][idex] = acclog[number]
        dataFrame['SVM1'][idex] = accsvml[number]
        dataFrame['SVMn1'][idex] = accsvmnl[number]
        dataFrame['KNN'][idex] = accknn[number]
        dataFrame['Navie'][idex] = accnav[number]
        dataFrame['Decision'][idex] = accdes[number]
        dataFrame['Random'][idex] = accrf[number]

    return dataFrame

In [13]:
acclog = []
accsvml = []
accsvmnl = []
accknn = []
accnav = []
accdes = []
accrf = []

In [14]:
forward_selected_features = forwardFeature(indep_X, dep_Y, 3)

Running Forward Selection for: LogisticRegression
Running Forward Selection for: SVC
Running Forward Selection for: RandomForestClassifier
Running Forward Selection for: DecisionTreeClassifier


In [15]:
X_train, X_test, y_train, y_test = split_scalar(forward_selected_features[0], dep_Y)

classifier, Accuracy, report, cm = logistic(X_train, y_train, X_test, y_test)
acclog.append(Accuracy)

classifier, Accuracy, report, cm = svm_linear(X_train, y_train, X_test, y_test)
accsvml.append(Accuracy)

classifier, Accuracy, report, cm = svm_NL(X_train, y_train, X_test, y_test)
accsvmnl.append(Accuracy)

classifier, Accuracy, report, cm = knn(X_train, y_train, X_test, y_test)
accknn.append(Accuracy)

classifier, Accuracy, report, cm = Navie(X_train, y_train, X_test, y_test)
accnav.append(Accuracy)

classifier, Accuracy, report, cm = Decision(X_train, y_train, X_test, y_test)
accdes.append(Accuracy)

classifier, Accuracy, report, cm = random(X_train, y_train, X_test, y_test)
accrf.append(Accuracy)

result = forward_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
result

Unnamed: 0,Logistic,SVM1,SVMn1,KNN,Navie,Decision,Random
ForwardSelector,0.96,0.96,0.96,0.944,0.944,0.928,0.936


In [17]:
models = [acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf]
model_names = ['Logistic Regression', 'SVM Linear', 'SVM Non-Linear', 'KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest']
best_model_index = np.argmax([np.mean(model) for model in models])
best_model_name = model_names[best_model_index]

print("Best Model: ", best_model_name)

# Execute the best model
if best_model_name == 'Logistic Regression':
    classifier, Accuracy, report, cm = logistic(X_train, y_train, X_test, y_test)
elif best_model_name == 'SVM Linear':
    classifier, Accuracy, report, cm = svm_linear(X_train, y_train, X_test, y_test)
elif best_model_name == 'SVM Non-Linear':
    classifier, Accuracy, report, cm = svm_NL(X_train, y_train, X_test, y_test)
elif best_model_name == 'KNN':
    classifier, Accuracy, report, cm = knn(X_train, y_train, X_test, y_test)
elif best_model_name == 'Naive Bayes':
    classifier, Accuracy, report, cm = Navie(X_train, y_train, X_test, y_test)
elif best_model_name == 'Decision Tree':
    classifier, Accuracy, report, cm = Decision(X_train, y_train, X_test, y_test)
elif best_model_name == 'Random Forest':
    classifier, Accuracy, report, cm = random(X_train, y_train, X_test, y_test)

# Calculate AUC score
from sklearn.metrics import roc_auc_score
y_pred_proba = classifier.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC Score: ", auc_score)

# Print classification report
print("Classification Report:")
print(report)

Best Model:  Logistic Regression
AUC Score:  0.8983333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.20      0.29         5
           1       0.97      0.99      0.98       120

    accuracy                           0.96       125
   macro avg       0.73      0.60      0.63       125
weighted avg       0.95      0.96      0.95       125



In [19]:
class_counts = dataset['high_IGCSE_performance'].value_counts()
print(class_counts)

1    472
0     28
Name: high_IGCSE_performance, dtype: int64


In [21]:
!pip install imbalanced-learn



In [23]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [24]:
df = pd.read_csv("student_igcse_dataset.csv")

In [25]:
X = df.drop(columns=["student_id", "high_IGCSE_performance"])
y = df["high_IGCSE_performance"]

In [26]:
dataset = pd.get_dummies(dataset, drop_first=True)

In [28]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(dataset, y)

In [29]:
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_balanced, y_balanced)
selected_features = dataset.columns[selector.get_support()]


  f = msb / msw


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_balanced, test_size=0.2, random_state=42)

# Try different classifiers
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier()
}

In [31]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))



=== Random Forest ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00        91

    accuracy                           1.00       189
   macro avg       1.00      1.00      1.00       189
weighted avg       1.00      1.00      1.00       189


=== SVM ===
              precision    recall  f1-score   support

           0       0.80      0.91      0.85        98
           1       0.88      0.76      0.82        91

    accuracy                           0.84       189
   macro avg       0.84      0.83      0.83       189
weighted avg       0.84      0.84      0.83       189


=== Logistic Regression ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00        91

    accuracy                           1.00       189
   macro avg       1.00      1.00      1.00       189
weighted 

In [33]:
models = [acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf]
model_names = ['Logistic Regression', 'SVM Linear', 'SVM Non-Linear', 'KNN', 'Naive Bayes', 'Decision Tree', 'Random Forest']
best_model_index = np.argmax([np.mean(model) for model in models])
best_model_name = model_names[best_model_index]

print("Best Model: ", best_model_name)

# Execute the best model
if best_model_name == 'Logistic Regression':
    classifier, Accuracy, report, cm = logistic(X_train, y_train, X_test, y_test)
elif best_model_name == 'SVM Linear':
    classifier, Accuracy, report, cm = svm_linear(X_train, y_train, X_test, y_test)
elif best_model_name == 'SVM Non-Linear':
    classifier, Accuracy, report, cm = svm_NL(X_train, y_train, X_test, y_test)
elif best_model_name == 'KNN':
    classifier, Accuracy, report, cm = knn(X_train, y_train, X_test, y_test)
elif best_model_name == 'Naive Bayes':
    classifier, Accuracy, report, cm = Navie(X_train, y_train, X_test, y_test)
elif best_model_name == 'Decision Tree':
    classifier, Accuracy, report, cm = Decision(X_train, y_train, X_test, y_test)
elif best_model_name == 'Random Forest':
    classifier, Accuracy, report, cm = random(X_train, y_train, X_test, y_test)

# Calculate AUC score
from sklearn.metrics import roc_auc_score
y_pred_proba = classifier.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
print("AUC Score: ", auc_score)

# Print classification report
print("Classification Report:")
print(report)

Best Model:  Logistic Regression
AUC Score:  0.9999999999999999
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00        91

    accuracy                           1.00       189
   macro avg       1.00      1.00      1.00       189
weighted avg       1.00      1.00      1.00       189



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
