In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors  import NearestCentroid
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.calibration import CalibratedClassifierCV

from sklearn.pipeline import make_pipeline
# Linear Models
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import PolynomialFeatures

# Linear & Quadratic Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Support Vector Machines
from sklearn.svm import LinearSVC, SVC

# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem

# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, CategoricalNB

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Additional
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Pre-processing

In [3]:
# Load the raw table and build the model inputs (X, y, X_pca) used by later cells.
file_path = "data.csv"
data = pd.read_csv(file_path, delimiter=";")
# Only 'Dropout' and 'Graduate' are considered. The explicit .copy() makes
# `data` an independent frame, so the column assignment below does not write
# into a filtered slice of the parsed CSV (SettingWithCopyWarning).
data = data[data['Target'] != 'Enrolled'].copy()

# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_columns = ["Marital status", "Daytime/evening attendance\t", "Gender", "Target"]
# Remove categorical columns from numerical processing
numerical_columns = [col for col in numerical_columns if col not in categorical_columns]

# Handle outliers column-wise: cap every value to mean +/- 3 standard deviations
data[numerical_columns] = data[numerical_columns].apply(
    lambda x: x.clip(lower=x.mean() - 3 * x.std(), upper=x.mean() + 3 * x.std())
)

# One-hot encode the categorical predictors ("Target", the last entry, is left as-is)
data_encoded = pd.get_dummies(data, columns=categorical_columns[:-1], drop_first=True)

# Standardize numerical columns for consistency across all classifiers
# NOTE(review): the scaler and the PCA below are fit on every row, i.e. before
# any train/test split or cross-validation fold is formed — confirm that this
# information sharing between folds is acceptable for the reported scores.
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

# Positional class index -> label name (alphabetical order of the two kept labels)
target_mapping = {0: "Dropout", 1: "Graduate"}

# Apply PCA to retain 95% of variance
X = data_encoded.drop(columns=["Target"])
y = data_encoded["Target"]
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)

## Benchmarking

In [9]:
# Candidate models for the benchmark. Estimators with a stochastic component
# get a fixed random_state so that re-running this cell reproduces the table.
classifiers = {
    "kNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "SVC": SVC(probability=True, random_state=42),  # enable probability estimates for SVC

    "LinearDiscriminant": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminant": QuadraticDiscriminantAnalysis(),

    "GaussianNB": GaussianNB(),

    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
    # confirm against the installed version before upgrading.
    "LogisticRegression": LogisticRegression(multi_class="multinomial", max_iter=500),
    "RandomForest": RandomForestClassifier(random_state=42),
    "Bagging": BaggingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
}

# Stratified split: keeps the same class proportions in both parts
# (the dataset is unbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

# cross_val_predict(method='predict_proba') orders the probability columns by
# sorted class label, so the argmax column index is decoded through this list
# rather than through a hard-coded index -> name mapping.
class_labels = sorted(set(y_test))

# NOTE(review): the out-of-fold predictions below are produced by 5-fold
# cross-validation on the held-out split only; X_train / y_train are not used
# in this cell — confirm this is the intended protocol.
results = {}  # column name -> per-sample values, in the row order of X_test

for clf_name, clf in tqdm(classifiers.items()):

    # Models without probability output are wrapped in a calibrator
    if not hasattr(clf, "predict_proba"):
        clf = CalibratedClassifierCV(clf)

    probs = cross_val_predict(clf, X_test, y_test, cv=5, method='predict_proba')
    preds = np.argmax(probs, axis=1)
    max_probs = np.max(probs, axis=1)

    # Predicted label and the probability assigned to it, one entry per sample
    results[f"{clf_name}_Label"] = [class_labels[idx] for idx in preds]
    results[f"{clf_name}_ProbScore"] = max_probs


results_df = pd.DataFrame(results)

# Add true labels
results_df["True_Label"] = list(y_test)

100%|██████████| 10/10 [00:01<00:00,  5.78it/s]


In [21]:
# Preview the first five rows of the per-classifier prediction table
results_df.head(n=5)

Unnamed: 0,kNN_Label,kNN_ProbScore,DecisionTree_Label,DecisionTree_ProbScore,SVC_Label,SVC_ProbScore,LinearDiscriminant_Label,LinearDiscriminant_ProbScore,QuadraticDiscriminant_Label,QuadraticDiscriminant_ProbScore,...,GaussianNB_ProbScore,LogisticRegression_Label,LogisticRegression_ProbScore,RandomForest_Label,RandomForest_ProbScore,Bagging_Label,Bagging_ProbScore,AdaBoost_Label,AdaBoost_ProbScore,True_Label
0,Graduate,0.6,Graduate,1.0,Graduate,0.510454,Graduate,0.852367,Dropout,0.999081,...,0.993331,Graduate,0.662619,Dropout,0.57,Graduate,0.8,Graduate,0.529478,Dropout
1,Graduate,1.0,Graduate,1.0,Graduate,0.941331,Graduate,0.986785,Graduate,1.0,...,0.861029,Graduate,0.941335,Graduate,0.77,Graduate,0.7,Graduate,0.604695,Graduate
2,Graduate,1.0,Graduate,1.0,Graduate,0.922043,Graduate,0.990851,Graduate,0.999999,...,0.991078,Graduate,0.977961,Graduate,0.79,Graduate,0.9,Graduate,0.592476,Graduate
3,Graduate,0.8,Graduate,1.0,Graduate,0.929558,Graduate,0.980752,Graduate,0.998666,...,0.986587,Graduate,0.932972,Graduate,0.76,Graduate,1.0,Graduate,0.606583,Graduate
4,Graduate,0.6,Graduate,1.0,Graduate,0.97081,Graduate,0.995383,Graduate,1.0,...,0.997634,Graduate,0.983171,Graduate,0.92,Graduate,1.0,Graduate,0.60402,Graduate


In [22]:
# Persist the prediction table; the row index is not written to the file
results_df.to_csv(path_or_buf="results_table.csv", index=False)

### Analysis

In [23]:
# Number of samples in the held-out split
n_test_samples = len(y_test)
print(n_test_samples)

726


In [7]:
# Class balance of the held-out split (a label that never occurs is reported as 0)
label_counts = pd.Series(y_test).value_counts()

for label in ("Graduate", "Dropout"):
    print(f"{label}: {label_counts.get(label, 0)}")

Graduate: 442
Dropout: 284


In [8]:
# Count, per classifier, how many predicted labels differ from the true label.
# The comparison is done on whole columns instead of a row-wise apply.
true_labels = results_df["True_Label"]
mismatch_counts = {}

for clf_name in classifiers.keys():
    predicted_labels = results_df[f"{clf_name}_Label"]
    # A missing prediction is not counted as a mismatch
    is_mismatch = predicted_labels.notna() & (predicted_labels != true_labels)
    mismatch_counts[clf_name] = is_mismatch.sum()

for clf_name, count in mismatch_counts.items():

    print(f"{clf_name}: {count} mismatches")

kNN: 122 mismatches
DecisionTree: 160 mismatches
SVC: 94 mismatches
LinearDiscriminant: 88 mismatches
QuadraticDiscriminant: 108 mismatches
GaussianNB: 145 mismatches
LogisticRegression: 84 mismatches
RandomForest: 91 mismatches
Bagging: 105 mismatches
AdaBoost: 110 mismatches


In [58]:
#Results of individual classifiers
classifiers = {
    "kNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(),
    "SVC": SVC(probability=True), #enable probability estimates for SVC
    "NearestCentroid": NearestCentroid(),
    "LinearDiscriminant": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminant": QuadraticDiscriminantAnalysis(),

    "GaussianNB": GaussianNB(),
    "GradientBoosting": GradientBoostingClassifier(),
    "LogisticRegression": LogisticRegression(multi_class="multinomial", max_iter=500),
    "RandomForest": RandomForestClassifier(),
    "Bagging": BaggingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
}
warnings.simplefilter('ignore')
for clf_name, clf in classifiers.items():
    scores = model_selection.cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy for %s: %0.2f (+/- %0.2f)" 
          % (clf_name, scores.mean(), scores.std() ))

Accuracy for kNN: 0.86 (+/- 0.00)
Accuracy for DecisionTree: 0.85 (+/- 0.01)
Accuracy for SVC: 0.91 (+/- 0.01)
Accuracy for NearestCentroid: 0.86 (+/- 0.00)
Accuracy for LinearDiscriminant: 0.90 (+/- 0.01)
Accuracy for QuadraticDiscriminant: 0.85 (+/- 0.01)
Accuracy for GaussianNB: 0.82 (+/- 0.01)
Accuracy for GradientBoosting: 0.91 (+/- 0.00)
Accuracy for LogisticRegression: 0.91 (+/- 0.01)
Accuracy for RandomForest: 0.91 (+/- 0.00)
Accuracy for Bagging: 0.89 (+/- 0.01)
Accuracy for AdaBoost: 0.90 (+/- 0.01)


In [17]:
# Reload the raw table and rebuild X, y, X_pca and the stratified train/test split.
file_path = "data.csv"
data = pd.read_csv(file_path, delimiter=";")
# Only 'Dropout' and 'Graduate' are considered. The explicit .copy() makes
# `data` an independent frame, so the column assignment below does not write
# into a filtered slice of the parsed CSV (SettingWithCopyWarning).
data = data[data['Target'] != 'Enrolled'].copy()


# Identify numerical and categorical columns
numerical_columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_columns = ["Marital status", "Daytime/evening attendance\t", "Gender", "Target"]

# Remove categorical columns from numerical processing
numerical_columns = [col for col in numerical_columns if col not in categorical_columns]
# Handle outliers column-wise: cap every value to mean +/- 3 standard deviations
data[numerical_columns] = data[numerical_columns].apply(
    lambda x: x.clip(lower=x.mean() - 3 * x.std(), upper=x.mean() + 3 * x.std())
)

# One-hot encode the categorical predictors ("Target", the last entry, is left as-is)
data_encoded = pd.get_dummies(data, columns=categorical_columns[:-1], drop_first=True)

# Standardize numerical columns for consistency across all classifiers
# NOTE(review): the scaler and the PCA below are fit on every row before the
# split on the last line — confirm that this is acceptable for the reported scores.
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

# Apply PCA to retain 95% of variance
X = data_encoded.drop(columns=["Target"])
y = data_encoded["Target"]
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X)
# Stratified split: keeps the same class proportions in both parts
# (the dataset is unbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

In [31]:
import itertools
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from tqdm import tqdm
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
warnings.filterwarnings("ignore", category=FutureWarning)

# Model zoo for the individual / pairwise-stacking comparison.
# Estimators with a stochastic component get a fixed random_state so that the
# cross-validated scores are reproducible between runs.
classifiers = {
    # Linear models
    "LogisticRegression": LogisticRegression(),
    "RidgeClassifier": RidgeClassifier(),

    # Linear & quadratic discriminant analysis
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "ShrinkageLDA": LinearDiscriminantAnalysis(solver="lsqr", shrinkage="auto"),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    # reg_param regularizes the per-class covariance estimates
    "RegularizedQDA": QuadraticDiscriminantAnalysis(reg_param=0.5),

    # Support vector machines
    "LinearSVC": LinearSVC(random_state=42),
    "SVC_rbf": SVC(kernel='rbf', probability=True, random_state=42),

    # Stochastic gradient descent
    "HingeSGDClassifier": SGDClassifier(loss='hinge', random_state=42),
    "LogLossSGDClassifier": SGDClassifier(loss='log_loss', random_state=42),
    "PolynomialSGDClassifier": make_pipeline(PolynomialFeatures(), SGDClassifier(random_state=42)),

    # Naive Bayes
    "GaussianNB": GaussianNB(),
    # The variants below are left out: they need a different data scale,
    # hence different train/test sets.
    #"MultinomialNB": MultinomialNB(),
    #"ComplementNB": ComplementNB(),
    #"CategoricalNB": CategoricalNB(),

    # Trees (max_depth=1 is a single-split stump) and forest
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=1, random_state=42),
    "FullDecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),

    # Additional
    "KNeighborsClassifier": KNeighborsClassifier(),
    "BaggingClassifier": BaggingClassifier(random_state=42),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=42)
}
# Classifiers needing calibration due to unreliable probabilities
# (looked up by name; keys must match those of `classifiers`)
requires_calibration = {
    "DecisionTreeClassifier",
    "KNeighborsClassifier",
    "SVC_rbf",
    "AdaBoostClassifier",
    "BaggingClassifier"
}

def indiv():
    """Cross-validate every model in ``classifiers`` on its own.

    Each model is scored with 5-fold cross-validated ROC AUC on
    ``(X_pca, y)``. A model is wrapped in a sigmoid
    ``CalibratedClassifierCV`` first when it has no ``predict_proba`` or when
    its name is listed in ``requires_calibration``.

    Returns:
        dict: classifier name -> ``(mean score, standard deviation)`` over
        the folds.
    """
    results = {}  # Dictionary to store classifier results
    tqdm_iter = tqdm(classifiers.items(), desc="Processing Individual Classifiers")

    for clf_name, clf in tqdm_iter:
        if not hasattr(clf, "predict_proba") or clf_name in requires_calibration:
            clf = CalibratedClassifierCV(clf, method='sigmoid')

        scores = cross_val_score(clf, X_pca, y, cv=5, scoring='roc_auc')
        # The metric is ROC AUC (see `scoring` above), so it is reported as such
        print("ROC AUC for %s: %0.2f (+/- %0.2f)"
              % (clf_name, scores.mean(), scores.std()))

        # Store results in the dictionary
        results[clf_name] = (scores.mean(), scores.std())

    return results


def combine():
    """Cross-validate a stacking ensemble for every pair of models.

    For each unordered pair drawn from ``classifiers`` a
    ``StackingClassifier`` with a logistic-regression meta-classifier is
    scored with 5-fold cross-validated ROC AUC on ``(X_pca, y)``.

    Returns:
        list: one ``(base_classifiers, mean score, standard deviation)``
        tuple per pair, where ``base_classifiers`` is the list of the two
        ``(name, estimator)`` entries.
    """
    # Materialize the pairs once so tqdm can read the total from the list length
    pairs = list(itertools.combinations(classifiers.items(), 2))
    results = []

    for combo in tqdm(pairs, desc="Processing combinations"):
        base_classifiers = list(combo)
        # Only logistic regression is used for now; other meta-classifiers can be checked
        meta_classifier = LogisticRegression()

        stacking_clf = StackingClassifier(estimators=base_classifiers, final_estimator=meta_classifier)

        # cross_val_score fits its own clones of the estimator on every fold,
        # so no separate fit / predict on the train/test split is needed here.
        scores = model_selection.cross_val_score(stacking_clf, X_pca, y, cv=5, scoring='roc_auc')
        accuracy = scores.mean()
        std_dev = scores.std()

        results.append((base_classifiers, accuracy, std_dev))
    return results

# Step 1: cross-validated score of every model on its own, keyed by name
individual_accuracies = indiv()

# Step 2: cross-validated score of every stacked pair
combination_results = combine()

# Step 3: one row per pair, comparing the stack with the better of its two members
comparison_table = []

for pair, pair_mean, pair_std in combination_results:
    (first_name, _), (second_name, _) = pair
    best_single = max(
        individual_accuracies[first_name][0],
        individual_accuracies[second_name][0],
    )

    row = {
        "Combined Models": f"{first_name} + {second_name}",
        "Individual Model 1": first_name,
        "Individual Model 2": second_name,
        "Combined Accuracy": pair_mean,
        "Best Individual Accuracy": best_single,
        "Difference": pair_mean - best_single,
        "Std Dev (Combined)": pair_std,
    }
    comparison_table.append(row)

# Step 4: assemble the rows into a DataFrame and show it
results_df = pd.DataFrame(comparison_table)
print(results_df)

Processing Individual Classifiers:  11%|█         | 2/18 [00:00<00:02,  5.54it/s]

Accuracy for LogisticRegression: 0.94 (+/- 0.01)
Accuracy for RidgeClassifier: 0.94 (+/- 0.00)
Accuracy for LinearDiscriminantAnalysis: 0.94 (+/- 0.00)


Processing Individual Classifiers:  22%|██▏       | 4/18 [00:00<00:01,  9.28it/s]

Accuracy for ShrinkageLDA: 0.94 (+/- 0.00)
Accuracy for QuadraticDiscriminantAnalysis: 0.91 (+/- 0.00)
Accuracy for RegularizedQDA: 0.90 (+/- 0.01)


Processing Individual Classifiers:  39%|███▉      | 7/18 [00:00<00:01,  9.55it/s]

Accuracy for LinearSVC: 0.94 (+/- 0.01)


Processing Individual Classifiers:  44%|████▍     | 8/18 [00:07<00:15,  1.59s/it]

Accuracy for SVC_rbf: 0.95 (+/- 0.00)


Processing Individual Classifiers:  56%|█████▌    | 10/18 [00:08<00:07,  1.01it/s]

Accuracy for HingeSGDClassifier: 0.94 (+/- 0.01)
Accuracy for LogLossSGDClassifier: 0.93 (+/- 0.01)


Processing Individual Classifiers:  61%|██████    | 11/18 [00:09<00:08,  1.18s/it]

Accuracy for PolynomialSGDClassifier: 0.90 (+/- 0.01)
Accuracy for GaussianNB: 0.86 (+/- 0.01)


Processing Individual Classifiers:  72%|███████▏  | 13/18 [00:10<00:03,  1.35it/s]

Accuracy for DecisionTreeClassifier: 0.76 (+/- 0.01)


Processing Individual Classifiers:  78%|███████▊  | 14/18 [00:10<00:02,  1.53it/s]

Accuracy for FullDecisionTreeClassifier: 0.82 (+/- 0.02)


Processing Individual Classifiers:  83%|████████▎ | 15/18 [00:14<00:04,  1.35s/it]

Accuracy for RandomForestClassifier: 0.93 (+/- 0.00)


Processing Individual Classifiers:  89%|████████▉ | 16/18 [00:14<00:02,  1.07s/it]

Accuracy for KNeighborsClassifier: 0.91 (+/- 0.00)


Processing Individual Classifiers:  94%|█████████▍| 17/18 [00:21<00:02,  2.86s/it]

Accuracy for BaggingClassifier: 0.93 (+/- 0.01)


Processing Individual Classifiers: 100%|██████████| 18/18 [00:31<00:00,  1.76s/it]


Accuracy for AdaBoostClassifier: 0.93 (+/- 0.00)


Processing combinations: 100%|██████████| 153/153 [17:42<00:00,  6.94s/it]

                                       Combined Models  \
0                 LogisticRegression + RidgeClassifier   
1      LogisticRegression + LinearDiscriminantAnalysis   
2                    LogisticRegression + ShrinkageLDA   
3    LogisticRegression + QuadraticDiscriminantAnal...   
4                  LogisticRegression + RegularizedQDA   
..                                                 ...   
148         RandomForestClassifier + BaggingClassifier   
149        RandomForestClassifier + AdaBoostClassifier   
150           KNeighborsClassifier + BaggingClassifier   
151          KNeighborsClassifier + AdaBoostClassifier   
152             BaggingClassifier + AdaBoostClassifier   

         Individual Model 1             Individual Model 2  Combined Accuracy  \
0        LogisticRegression                RidgeClassifier           0.942194   
1        LogisticRegression     LinearDiscriminantAnalysis           0.941650   
2        LogisticRegression                   ShrinkageLDA  




In [33]:
# Persist the comparison table; the row index is not written to the file
results_df.to_csv(path_or_buf="results_table_last.csv", index=False)

## Meta-Classifier

In [62]:
#some checkings
meta_classifier = LogisticRegression()
base_classifier1 = [
    ('SVC', classifiers["SVC"]),
    ('LogisticRegression', classifiers["LogisticRegression"]),
    ('RandomForest', classifiers["RandomForest"]),
    ]
base_classifier2 = [
    ('GradientBoostingClassifier', classifiers["GradientBoosting"]),
    ("kNN", classifiers["kNN"]),
    ("GaussianNB", classifiers["GaussianNB"]),
    ]
base_classifier3 = [
    ('SVC', classifiers["SVC"]),
    ("LinearDiscriminant", classifiers["LinearDiscriminant"]),
    ('AdaBoost', classifiers["AdaBoost"]),
]

stacking_clf = StackingClassifier(estimators=base_classifier3, final_estimator=meta_classifier)
# Train stacking classifier
stacking_clf.fit(X_train, y_train)

# Predict on test data
y_pred = stacking_clf.predict(X_test)

scores = model_selection.cross_val_score(stacking_clf , X, y, cv=5, scoring='accuracy')
print("Accuracy for Stacking Classifier: %0.2f (+/- %0.2f)" 
          % (scores.mean(), scores.std() ))

Accuracy for Stacking Classifier: 0.91 (+/- 0.01)
