In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Pre-processing

In [2]:
file_path = "data.csv"
data = pd.read_csv(file_path, delimiter=";")
data = data[data['Target'] != 'Enrolled'] #only considering 'Dropout' and 'Graduate'

# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_columns = ["Marital status", "Daytime/evening attendance\t", "Gender", "Target"]
# Remove categorical columns from numerical processing
numerical_columns = [col for col in numerical_columns if col not in categorical_columns]

# Handle outliers in numerical columns using z-score (capping values beyond 3 standard deviations)
data[numerical_columns] = data[numerical_columns].apply(
    lambda x: x.clip(lower=x.mean() - 3 * x.std(), upper=x.mean() + 3 * x.std())
)

# Encoding categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_columns[:-1], drop_first=True)

# Standardize numerical columns for consistency across all classifiers
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

target_mapping = {0: "Dropout", 1: "Graduate"}
# Apply PCA to retain 95% of variance
X = data_encoded.drop(columns=["Target"])
y = data_encoded["Target"]
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)

## Benchmarking

In [3]:
classifiers = {
    "kNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVC": SVC(probability=True), #enable probability estimates for SVC

    "LinearDiscriminant": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminant": QuadraticDiscriminantAnalysis(),

    "GaussianNB": GaussianNB(),

    "LogisticRegression": LogisticRegression(multi_class="multinomial", max_iter=500),
    "RandomForest": RandomForestClassifier(),
    "Bagging": BaggingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
}

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42, stratify=y) #for having same proportions of each class through folds (dataset unbalanced!)
results = []

tqdm_iter = tqdm(classifiers.items())
for clf_name, clf in tqdm_iter:

    if not hasattr(clf, "predict_proba"):
        clf = CalibratedClassifierCV(clf)
    
    probs = cross_val_predict(clf, X_test, y_test, cv=5, method='predict_proba')
    preds = np.argmax(probs, axis=1)
    max_probs = np.max(probs, axis=1)
    
    for i, (pred, prob) in enumerate(zip(preds, max_probs)):

        if len(results) <= i:

            results.append({})
        label = target_mapping[pred]
        results[i][f"{clf_name}_Label"] = label
        results[i][f"{clf_name}_ProbScore"] = prob


results_df = pd.DataFrame(results)

# Add true labels 
results_df["True_Label"] = list(y_test)

100%|██████████| 10/10 [00:03<00:00,  2.57it/s]


In [4]:
results_df.head()

Unnamed: 0,kNN_Label,kNN_ProbScore,DecisionTree_Label,DecisionTree_ProbScore,SVC_Label,SVC_ProbScore,LinearDiscriminant_Label,LinearDiscriminant_ProbScore,QuadraticDiscriminant_Label,QuadraticDiscriminant_ProbScore,...,GaussianNB_ProbScore,LogisticRegression_Label,LogisticRegression_ProbScore,RandomForest_Label,RandomForest_ProbScore,Bagging_Label,Bagging_ProbScore,AdaBoost_Label,AdaBoost_ProbScore,True_Label
0,Graduate,0.6,Graduate,1.0,Graduate,0.512522,Graduate,0.852367,Dropout,0.999081,...,0.993331,Graduate,0.662619,Dropout,0.5,Graduate,0.9,Graduate,0.513759,Dropout
1,Graduate,1.0,Graduate,1.0,Graduate,0.946354,Graduate,0.986785,Graduate,1.0,...,0.861029,Graduate,0.941335,Graduate,0.83,Graduate,1.0,Graduate,0.522424,Graduate
2,Graduate,1.0,Graduate,1.0,Graduate,0.927952,Graduate,0.990851,Graduate,0.999999,...,0.991078,Graduate,0.977961,Graduate,0.78,Graduate,0.7,Graduate,0.537284,Graduate
3,Graduate,0.8,Graduate,1.0,Graduate,0.935147,Graduate,0.980752,Graduate,0.998666,...,0.986587,Graduate,0.932972,Graduate,0.88,Graduate,0.9,Graduate,0.534127,Graduate
4,Graduate,0.6,Graduate,1.0,Graduate,0.973975,Graduate,0.995383,Graduate,1.0,...,0.997634,Graduate,0.983171,Graduate,0.91,Graduate,0.8,Graduate,0.545851,Graduate


In [5]:
results_df.to_csv("results_table.csv", index=False)

### Analysis

In [6]:
print(len(y_test))

726


In [7]:
label_counts = pd.Series(y_test).value_counts()

print(f"Graduate: {label_counts.get('Graduate', 0)}")
print(f"Dropout: {label_counts.get('Dropout', 0)}")

Graduate: 442
Dropout: 284


In [8]:
mismatch_counts = {}

for clf_name in classifiers.keys():

    mismatch_counts[clf_name] = results_df.apply(
        lambda row: row[f"{clf_name}_Label"] != row["True_Label"] 
        if pd.notna(row[f"{clf_name}_Label"]) else False, 
        axis=1
    ).sum()

for clf_name, count in mismatch_counts.items():
    
    print(f"{clf_name}: {count} mismatches")

kNN: 122 mismatches
DecisionTree: 160 mismatches
SVC: 94 mismatches
LinearDiscriminant: 88 mismatches
QuadraticDiscriminant: 108 mismatches
GaussianNB: 145 mismatches
LogisticRegression: 84 mismatches
RandomForest: 91 mismatches
Bagging: 105 mismatches
AdaBoost: 110 mismatches
