In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Pre-processing

In [2]:
file_path = "data.csv"
data = pd.read_csv(file_path, delimiter=";")

# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_columns = ["Marital status", "Daytime/evening attendance\t", "Gender", "Target"]
# Remove categorical columns from numerical processing
numerical_columns = [col for col in numerical_columns if col not in categorical_columns]

# Handle outliers in numerical columns using z-score (capping values beyond 3 standard deviations)
data[numerical_columns] = data[numerical_columns].apply(
    lambda x: x.clip(lower=x.mean() - 3 * x.std(), upper=x.mean() + 3 * x.std())
)

# Encoding categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_columns[:-1], drop_first=True)

# Standardize numerical columns for consistency across all classifiers
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

target_mapping = {0: "Dropout", 1: "Enrolled", 2: "Graduate"}
# Apply PCA to retain 95% of variance
X = data_encoded.drop(columns=["Target"])
y = data_encoded["Target"]
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)

## Benchmarking

In [3]:
classifiers = {
    "kNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVC": SVC(probability=True), #enable probability estimates for SVC

    "LinearDiscriminant": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminant": QuadraticDiscriminantAnalysis(),

    "GaussianNB": GaussianNB(),

    "LogisticRegression": LogisticRegression(multi_class="multinomial", max_iter=500),
    "RandomForest": RandomForestClassifier(),
    "Bagging": BaggingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
}

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42, stratify=y) #for having same proportions of each class through folds (dataset unbalanced!)
results = []

tqdm_iter = tqdm(classifiers.items())
for clf_name, clf in tqdm_iter:

    if not hasattr(clf, "predict_proba"):
        clf = CalibratedClassifierCV(clf)
    
    probs = cross_val_predict(clf, X_test, y_test, cv=5, method='predict_proba')
    preds = np.argmax(probs, axis=1)
    max_probs = np.max(probs, axis=1)
    
    for i, (pred, prob) in enumerate(zip(preds, max_probs)):

        if len(results) <= i:

            results.append({})
        label = target_mapping[pred]
        if label == "Enrolled":

            results[i][f"{clf_name}_Label"] = np.nan
            results[i][f"{clf_name}_ProbScore"] = np.nan
        else:

            results[i][f"{clf_name}_Label"] = label
            results[i][f"{clf_name}_ProbScore"] = prob


results_df = pd.DataFrame(results)

# Add true labels 
results_df["True_Label"] = list(y_test)

100%|██████████| 10/10 [00:07<00:00,  1.26it/s]


In [5]:
results_df.head()

Unnamed: 0,kNN_Label,kNN_ProbScore,DecisionTree_Label,DecisionTree_ProbScore,SVC_Label,SVC_ProbScore,LinearDiscriminant_Label,LinearDiscriminant_ProbScore,QuadraticDiscriminant_Label,QuadraticDiscriminant_ProbScore,...,GaussianNB_ProbScore,LogisticRegression_Label,LogisticRegression_ProbScore,RandomForest_Label,RandomForest_ProbScore,Bagging_Label,Bagging_ProbScore,AdaBoost_Label,AdaBoost_ProbScore,True_Label
0,Graduate,0.8,Dropout,1.0,Graduate,0.879428,Graduate,0.867365,Graduate,0.996525,...,0.911931,Graduate,0.856286,Graduate,0.68,Graduate,0.6,Graduate,0.360882,Graduate
1,Graduate,0.8,Graduate,1.0,Graduate,0.789532,Graduate,0.760473,Graduate,0.959568,...,0.894778,Graduate,0.655494,Graduate,0.75,Graduate,0.9,Graduate,0.351642,Graduate
2,Dropout,0.4,,,Dropout,0.663856,Dropout,0.675671,,,...,,Dropout,0.699086,Dropout,0.58,Dropout,0.8,Dropout,0.361247,Enrolled
3,Graduate,1.0,Graduate,1.0,Graduate,0.952936,Graduate,0.953419,Graduate,0.992552,...,0.956069,Graduate,0.933668,Graduate,0.82,Graduate,0.6,Graduate,0.357666,Graduate
4,Graduate,0.8,Graduate,1.0,Graduate,0.915668,Graduate,0.906672,Graduate,0.912792,...,0.853458,Graduate,0.818331,Graduate,0.72,Graduate,0.8,Graduate,0.348888,Graduate


In [7]:
results_df.to_csv("results_table.csv", index=False)