In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import make_pipeline
# Linear Models
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import PolynomialFeatures

# Linear & Quadratic Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Support Vector Machines
from sklearn.svm import LinearSVC, SVC

# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem

# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, CategoricalNB

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Additional
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Pre-processing

In [2]:
# Load the raw table and build the feature matrix used by every classifier:
# filter rows -> cap outliers -> one-hot encode -> standardise -> PCA.
file_path = "data.csv"
data = pd.read_csv(file_path, delimiter=";")

# Only 'Dropout' and 'Graduate' are considered. The explicit .copy() makes
# `data` an independent frame, so the column assignment below does not write
# into a slice of the original (SettingWithCopyWarning on pandas < 3).
data = data[data['Target'] != 'Enrolled'].copy()

# Integer-coded categorical columns (plus the label) must not be treated as
# numerical. The trailing "\t" is part of the column name as written here.
categorical_columns = ["Marital status", "Daytime/evening attendance\t", "Gender", "Target"]
numerical_columns = [
    col
    for col in data.select_dtypes(include=["float64", "int64"]).columns.tolist()
    if col not in categorical_columns
]


def _cap_at_three_sigma(column):
    """Clip one numerical column to mean +/- 3 standard deviations."""
    center, spread = column.mean(), column.std()
    return column.clip(lower=center - 3 * spread, upper=center + 3 * spread)


# Handle outliers column-wise by capping values beyond 3 standard deviations.
data[numerical_columns] = data[numerical_columns].apply(_cap_at_three_sigma)

# One-hot encode the categorical features; the last entry ("Target") is the
# label and is left untouched.
data_encoded = pd.get_dummies(data, columns=categorical_columns[:-1], drop_first=True)

# Standardize numerical columns for consistency across all classifiers.
# NOTE(review): the scaler and the PCA below are fitted on all rows, i.e.
# before any train/test split made downstream, so held-out rows influence the
# transformation. Wrap them in a Pipeline if leakage-free estimates are needed.
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

# Apply PCA, keeping as many components as needed to retain 95% of variance.
X = data_encoded.drop(columns=["Target"])
y = data_encoded["Target"]
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)

## Benchmarking

In [3]:
# Benchmark a set of classifiers: for each one, collect out-of-fold
# predict_proba outputs on the held-out split and store, per sample, the
# Dropout probability (`<name>_conf`) and a 1/0 Dropout flag (`<name>_pred`).

# One seed shared by every stochastic estimator and by the split, so that
# Restart & Run All reproduces the same table.
RANDOM_STATE = 42

classifiers = {
    "LogisticRegression": LogisticRegression(),
    "RidgeClassifier": RidgeClassifier(),

    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "ShrinkageLDA": LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto"),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    # TODO(review): reg_param=0.5 was left as an open question by the author.
    "RegularizedQDA": QuadraticDiscriminantAnalysis(reg_param=0.5),

    "LinearSVC": LinearSVC(random_state=RANDOM_STATE),
    "SVC_rbf": SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),

    "HingeSGDClassifier": SGDClassifier(loss='hinge', random_state=RANDOM_STATE),
    "LogLossSGDClassifier": SGDClassifier(loss='log_loss', random_state=RANDOM_STATE),
    "PolynomialSGDClassifier": make_pipeline(
        PolynomialFeatures(), SGDClassifier(random_state=RANDOM_STATE)
    ),

    "GaussianNB": GaussianNB(),
    # MultinomialNB, ComplementNB and CategoricalNB are left out: they need a
    # different data scale, which would mean different train/test sets.

    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=1, random_state=RANDOM_STATE),
    "FullDecisionTreeClassifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_STATE),

    "KNeighborsClassifier": KNeighborsClassifier(),
    "BaggingClassifier": BaggingClassifier(random_state=RANDOM_STATE),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=RANDOM_STATE)
}
# Classifiers needing calibration due to unreliable probabilities
requires_calibration = {
    "DecisionTreeClassifier",
    "FullDecisionTreeClassifier",
    "KNeighborsClassifier",
    "AdaBoostClassifier",
    "BaggingClassifier"
}

# Stratified split keeps the class proportions (the dataset is unbalanced).
# NOTE(review): X_train / y_train are not used in this cell; every model is
# cross-validated on the 20% hold-out only. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# cross_val_predict orders the predict_proba columns by sorted class label, so
# derive the label -> column mapping from the data instead of hard-coding it.
target_mapping = {label: idx for idx, label in enumerate(np.unique(y_test))}
dropout_col = target_mapping["Dropout"]

# Column-oriented accumulator: one array per output column.
result_columns = {"True_Label": y_test.to_numpy()}

tqdm_iter = tqdm(classifiers.items())
for clf_name, clf in tqdm_iter:

    # Wrap models that expose no predict_proba, or whose probabilities are
    # listed as unreliable, in a sigmoid calibrator.
    if not hasattr(clf, "predict_proba") or clf_name in requires_calibration:
        tqdm.write(clf_name)  # keeps the progress bar intact, unlike print()
        clf = CalibratedClassifierCV(clf, method='sigmoid')

    # Out-of-fold class probabilities for every sample of the hold-out split.
    probs = cross_val_predict(clf, X_test, y_test, cv=5, method='predict_proba')
    preds = np.argmax(probs, axis=1)

    result_columns[f"{clf_name}_conf"] = probs[:, dropout_col]
    result_columns[f"{clf_name}_pred"] = (preds == dropout_col).astype("int64")


results_df = pd.DataFrame(result_columns)
# Row-wise view kept under the original name for any later cell that uses it.
results = results_df.to_dict("records")

 11%|█         | 2/18 [00:00<00:01,  9.22it/s]

RidgeClassifier


 39%|███▉      | 7/18 [00:00<00:00, 18.13it/s]

LinearSVC


 50%|█████     | 9/18 [00:00<00:01,  8.76it/s]

HingeSGDClassifier
PolynomialSGDClassifier


 72%|███████▏  | 13/18 [00:01<00:00,  8.34it/s]

DecisionTreeClassifier
FullDecisionTreeClassifier


 83%|████████▎ | 15/18 [00:03<00:01,  2.81it/s]

KNeighborsClassifier


 89%|████████▉ | 16/18 [00:03<00:00,  2.57it/s]

BaggingClassifier


 94%|█████████▍| 17/18 [00:06<00:00,  1.29it/s]

AdaBoostClassifier


100%|██████████| 18/18 [00:11<00:00,  1.59it/s]


In [4]:
# Preview the first five rows of the benchmark table.
results_df.head(n=5)

Unnamed: 0,True_Label,LogisticRegression_conf,LogisticRegression_pred,RidgeClassifier_conf,RidgeClassifier_pred,LinearDiscriminantAnalysis_conf,LinearDiscriminantAnalysis_pred,ShrinkageLDA_conf,ShrinkageLDA_pred,QuadraticDiscriminantAnalysis_conf,...,FullDecisionTreeClassifier_conf,FullDecisionTreeClassifier_pred,RandomForestClassifier_conf,RandomForestClassifier_pred,KNeighborsClassifier_conf,KNeighborsClassifier_pred,BaggingClassifier_conf,BaggingClassifier_pred,AdaBoostClassifier_conf,AdaBoostClassifier_pred
0,Dropout,0.341509,0,0.29744,0,0.147633,0,0.319817,0,0.9990813,...,0.281714,0,0.48,0,0.492927,0,0.160978,0,0.374157,0
1,Graduate,0.061429,0,0.069597,0,0.013215,0,0.043883,0,1.62035e-07,...,0.196472,0,0.18,0,0.098991,0,0.164925,0,0.119818,0
2,Graduate,0.024072,0,0.052384,0,0.009149,0,0.052966,0,1.18724e-06,...,0.493762,0,0.29,0,0.127961,0,0.134008,0,0.120937,0
3,Graduate,0.070227,0,0.081399,0,0.019248,0,0.057954,0,0.001334457,...,0.196472,0,0.15,0,0.201423,0,0.104056,0,0.151312,0
4,Graduate,0.017772,0,0.032749,0,0.004617,0,0.027052,0,2.525464e-07,...,0.281714,0,0.09,0,0.37425,0,0.104344,0,0.170534,0


In [5]:
# Show the first rows of the SVC (RBF) confidence and prediction columns.
tuple(results_df[column].head() for column in ("SVC_rbf_conf", "SVC_rbf_pred"))

(0    0.469733
 1    0.059526
 2    0.078305
 3    0.071009
 4    0.030366
 Name: SVC_rbf_conf, dtype: float64,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: SVC_rbf_pred, dtype: int64)

In [6]:
# Write the benchmark table to disk without the row index.
results_df.to_csv(path_or_buf="results_table.csv", index=False)

### Analysis

In [7]:
# Number of samples in the held-out split.
print(y_test.shape[0])

726


In [8]:
# Class balance of the held-out labels; a label that is absent prints as 0.
label_counts = pd.Series(y_test).value_counts()

for label in ("Graduate", "Dropout"):
    print(f"{label}: {label_counts.get(label, 0)}")

Graduate: 442
Dropout: 284


In [9]:
# Encode the true labels once: 1 for "Dropout", 0 for anything else.
true_is_dropout = (results_df["True_Label"] == "Dropout").astype(int)

# Per classifier, count the samples whose predicted flag differs from the truth.
mismatches = {
    clf_name: (results_df[f"{clf_name}_pred"] != true_is_dropout).sum()
    for clf_name in classifiers
}

for clf_name, n_wrong in mismatches.items():
    print(f"{clf_name}: {n_wrong} mismatches")

LogisticRegression: 84 mismatches
RidgeClassifier: 85 mismatches
LinearDiscriminantAnalysis: 88 mismatches
ShrinkageLDA: 89 mismatches
QuadraticDiscriminantAnalysis: 108 mismatches
RegularizedQDA: 118 mismatches
LinearSVC: 82 mismatches
SVC_rbf: 94 mismatches
HingeSGDClassifier: 101 mismatches
LogLossSGDClassifier: 104 mismatches
PolynomialSGDClassifier: 145 mismatches
GaussianNB: 145 mismatches
DecisionTreeClassifier: 141 mismatches
FullDecisionTreeClassifier: 119 mismatches
RandomForestClassifier: 93 mismatches
KNeighborsClassifier: 118 mismatches
BaggingClassifier: 97 mismatches
AdaBoostClassifier: 95 mismatches
