In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Pre-processing

In [2]:
# Load the semicolon-delimited dataset.
file_path = "data.csv"
data = pd.read_csv(file_path, delimiter=";")

# Columns handled as categorical. The trailing "\t" in the attendance column
# is part of the name exactly as this notebook spells it; "Target" is listed
# last so it can be sliced off before one-hot encoding.
categorical_columns = ["Marital status", "Daytime/evening attendance\t", "Gender", "Target"]

# Every float64/int64 column that is not listed above is treated as numerical.
numerical_columns = [
    col
    for col in data.select_dtypes(include=["float64", "int64"]).columns
    if col not in categorical_columns
]


def _cap_at_three_sigma(column):
    """Clip a column to the interval [mean - 3*std, mean + 3*std]."""
    center, spread = column.mean(), column.std()
    return column.clip(lower=center - 3 * spread, upper=center + 3 * spread)


# Outlier handling: cap (rather than drop) values beyond 3 standard deviations.
data[numerical_columns] = data[numerical_columns].apply(_cap_at_three_sigma)

# One-hot encode every categorical column except the last entry ("Target"),
# dropping the first level of each to avoid a redundant indicator.
feature_categoricals = categorical_columns[:-1]
data_encoded = pd.get_dummies(data, columns=feature_categoricals, drop_first=True)

# Standardize the numerical columns so all classifiers see comparable scales.
# NOTE(review): the scaler and the PCA below are fitted on every row of the
# dataset; if the data is split into train/test afterwards, the held-out rows
# have already influenced these transforms — confirm this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_encoded[numerical_columns])
data_encoded[numerical_columns] = scaled_values

# Separate the label from the features.
y = data_encoded["Target"]
X = data_encoded.drop(columns=["Target"])

# Project the features with PCA, keeping enough components for 95% of variance.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)

## Benchmarking

In [3]:
# Seed shared by every classifier that has a stochastic component, so that
# Restart & Run All reproduces the same probabilities.
SEED = 42

# Classifiers under comparison, keyed by the name used in the result columns.
classifiers = {
    "kNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=SEED),
    # probability=True enables predict_proba for SVC.
    "SVC": SVC(probability=True, random_state=SEED),

    "LinearDiscriminant": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminant": QuadraticDiscriminantAnalysis(),

    "GaussianNB": GaussianNB(),

    # `multi_class` is deprecated in scikit-learn; the default solver already
    # fits a multinomial model on a multi-class target, so it is omitted.
    "LogisticRegression": LogisticRegression(max_iter=500),
    "RandomForest": RandomForestClassifier(random_state=SEED),
    "Bagging": BaggingClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
}


N = 20  # number of experiments (one random 80/20 split per experiment)
prob_scores = []  # one record (dict) per experiment

for i in tqdm(range(N)):

    # The experiment index doubles as the split seed, so each experiment uses
    # a different but reproducible train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=i)

    # Label lookups for this split. They are not needed for decoding below
    # (the fitted estimator's classes_ is used instead).
    # NOTE(review): kept because cells outside this view may read them — confirm.
    class_labels = np.unique(y_train)
    label_to_index = {label: idx for idx, label in enumerate(class_labels)}

    current_scores = {"Experiment": i + 1}
    for clf_name, clf in classifiers.items():

        if hasattr(clf, "predict_proba"):
            model = clf
        else:
            # Platt scaling for estimators without native probabilities.
            # CalibratedClassifierCV with cv=5 fits its own copies of `clf`,
            # so `clf` itself is not fitted separately beforehand.
            model = CalibratedClassifierCV(clf, method="sigmoid", cv=5)

        model.fit(X_train, y_train)
        probs = model.predict_proba(X_test)  # class probabilities, not hard labels

        # Columns of `probs` follow model.classes_, so decode the most
        # probable class through the fitted estimator's own label order.
        preds = model.classes_[probs.argmax(axis=1)].tolist()

        current_scores[f"Label ({clf_name})"] = preds
        current_scores[f"Prob ({clf_name})"] = probs.tolist()

    prob_scores.append(current_scores)

# One row per experiment; each cell holds the full list of test-set
# predictions or probability vectors for one classifier.
prob_scores_df = pd.DataFrame(prob_scores)

100%|██████████| 20/20 [02:02<00:00,  6.14s/it]


In [4]:
prob_scores_df.head()

Unnamed: 0,Experiment,Label (kNN),Prob (kNN),Label (DecisionTree),Prob (DecisionTree),Label (SVC),Prob (SVC),Label (LinearDiscriminant),Prob (LinearDiscriminant),Label (QuadraticDiscriminant),...,Label (GaussianNB),Prob (GaussianNB),Label (LogisticRegression),Prob (LogisticRegression),Label (RandomForest),Prob (RandomForest),Label (Bagging),Prob (Bagging),Label (AdaBoost),Prob (AdaBoost)
0,1,"[Graduate, Dropout, Graduate, Enrolled, Dropou...","[[0.0, 0.2, 0.8], [1.0, 0.0, 0.0], [0.2, 0.0, ...","[Graduate, Dropout, Dropout, Graduate, Enrolle...","[[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [1.0, 0.0, ...","[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.05139014233908383, 0.2041335135569115, 0.7...","[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.024407995070653145, 0.17657345493775226, 0...","[Graduate, Dropout, Graduate, Graduate, Dropou...",...,"[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.03166207102080041, 0.17166944798393427, 0....","[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.04701450048507603, 0.1771524410484938, 0.7...","[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.11, 0.29, 0.6], [0.84, 0.1, 0.06], [0.04, ...","[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.2, 0.3, 0.5], [0.9, 0.1, 0.0], [0.0, 0.0, ...","[Graduate, Dropout, Graduate, Graduate, Dropou...","[[0.3246436768991152, 0.33478063848601713, 0.3..."
1,2,"[Dropout, Graduate, Graduate, Graduate, Dropou...","[[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.2, 0.0, ...","[Dropout, Graduate, Graduate, Graduate, Enroll...","[[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...","[Dropout, Graduate, Graduate, Graduate, Dropou...","[[0.9270497907839012, 0.06899573491367075, 0.0...","[Dropout, Graduate, Graduate, Graduate, Dropou...","[[0.9648554976432007, 0.011761313387859473, 0....","[Dropout, Graduate, Graduate, Graduate, Dropou...",...,"[Dropout, Graduate, Graduate, Graduate, Dropou...","[[0.9339876192247007, 0.008676454070069232, 0....","[Dropout, Graduate, Graduate, Graduate, Dropou...","[[0.9778277114500894, 0.021282138075454045, 0....","[Dropout, Graduate, Graduate, Graduate, Dropou...","[[0.98, 0.02, 0.0], [0.01, 0.02, 0.97], [0.07,...","[Dropout, Graduate, Graduate, Graduate, Enroll...","[[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...","[Dropout, Graduate, Graduate, Graduate, Dropou...","[[0.3546923202450973, 0.33118878407329394, 0.3..."
2,3,"[Graduate, Enrolled, Graduate, Dropout, Gradua...","[[0.0, 0.0, 1.0], [0.2, 0.4, 0.4], [0.0, 0.2, ...","[Enrolled, Graduate, Dropout, Enrolled, Gradua...","[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, ...","[Enrolled, Dropout, Graduate, Dropout, Graduat...","[[0.34439752324303724, 0.3923070019837975, 0.2...","[Enrolled, Graduate, Graduate, Dropout, Gradua...","[[0.23143518410300457, 0.46519272467780187, 0....","[Graduate, Enrolled, Graduate, Enrolled, Gradu...",...,"[Graduate, Enrolled, Graduate, Dropout, Dropou...","[[0.16338651899054482, 0.10685622854946918, 0....","[Dropout, Graduate, Graduate, Dropout, Graduat...","[[0.4304880675261239, 0.4088614674831752, 0.16...","[Graduate, Graduate, Graduate, Dropout, Gradua...","[[0.31, 0.29, 0.4], [0.22, 0.38, 0.4], [0.11, ...","[Enrolled, Enrolled, Graduate, Dropout, Gradua...","[[0.2, 0.5, 0.3], [0.3, 0.6, 0.1], [0.0, 0.2, ...","[Dropout, Graduate, Graduate, Dropout, Graduat...","[[0.3470712742220787, 0.32972338392155615, 0.3..."
3,4,"[Graduate, Dropout, Enrolled, Graduate, Enroll...","[[0.0, 0.2, 0.8], [0.6, 0.0, 0.4], [0.2, 0.4, ...","[Graduate, Dropout, Enrolled, Graduate, Gradua...","[[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, ...","[Graduate, Dropout, Enrolled, Graduate, Gradua...","[[0.031186375530215606, 0.06699681751865616, 0...","[Graduate, Dropout, Enrolled, Graduate, Gradua...","[[0.011121206936761344, 0.07372711292091684, 0...","[Graduate, Dropout, Enrolled, Graduate, Gradua...",...,"[Graduate, Graduate, Graduate, Graduate, Gradu...","[[0.015767301616952122, 0.04422554303051563, 0...","[Graduate, Dropout, Enrolled, Graduate, Gradua...","[[0.033226943372910954, 0.0890065461048606, 0....","[Graduate, Dropout, Enrolled, Graduate, Enroll...","[[0.06, 0.21, 0.73], [0.61, 0.23, 0.16], [0.24...","[Graduate, Dropout, Enrolled, Graduate, Gradua...","[[0.3, 0.3, 0.4], [0.5, 0.2, 0.3], [0.2, 0.6, ...","[Graduate, Dropout, Enrolled, Graduate, Gradua...","[[0.32018077019124597, 0.3261220063473578, 0.3..."
4,5,"[Graduate, Enrolled, Dropout, Graduate, Gradua...","[[0.0, 0.2, 0.8], [0.2, 0.4, 0.4], [0.4, 0.4, ...","[Dropout, Graduate, Dropout, Graduate, Dropout...","[[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, ...","[Enrolled, Graduate, Dropout, Graduate, Dropou...","[[0.13086413511237013, 0.5668339782324031, 0.3...","[Graduate, Graduate, Dropout, Graduate, Dropou...","[[0.07553510120610431, 0.28704811250234297, 0....","[Enrolled, Graduate, Dropout, Graduate, Dropou...",...,"[Graduate, Graduate, Dropout, Graduate, Dropou...","[[0.07320939165191916, 0.22888894545667926, 0....","[Graduate, Graduate, Dropout, Graduate, Dropou...","[[0.1653643814707791, 0.26576458761634714, 0.5...","[Enrolled, Graduate, Dropout, Graduate, Dropou...","[[0.17, 0.42, 0.41], [0.11, 0.16, 0.73], [0.53...","[Enrolled, Graduate, Dropout, Graduate, Dropou...","[[0.3, 0.5, 0.2], [0.1, 0.2, 0.7], [0.4, 0.3, ...","[Dropout, Graduate, Dropout, Graduate, Dropout...","[[0.34406732267016604, 0.33552318019733374, 0...."


In [6]:
# Inspect one classifier's stored probabilities for a single experiment.
experiment = 1
classifier_name = "LogisticRegression"

# Select the probability cell(s) of the chosen experiment and classifier.
# NOTE: this rebinds `prob_scores`, which the benchmarking cell used for its
# list of per-experiment records; here it becomes an array of the selection.
experiment_mask = prob_scores_df["Experiment"] == experiment
prob_column = f"Prob ({classifier_name})"
prob_scores = prob_scores_df.loc[experiment_mask, prob_column].values

# Sanity check: the class probabilities of the first test sample are summed
# and printed (a value of 1.0 is expected for a proper distribution).
print(sum(prob_scores[0][0]))

1.0
