In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import itertools
from matplotlib.ticker import ScalarFormatter
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [27]:
df_raw = pd.read_csv("dropout_rate.csv", sep=";", encoding="utf-8-sig")

In [28]:
df_raw.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd

### Chọn feature

In [29]:
ft_cols_qualitative = [
    'Tuition fees up to date',
    'Scholarship holder',
    'Debtor',
]

ft_cols_qualitative_to_one_hot = [
    
]

ft_cols_quantitative = [
    'Application order', 
    'Age at enrollment', 
    'Curricular units 1st sem (approved)', 
    'Curricular units 1st sem (grade)',
    'Curricular units 2nd sem (approved)', 
    'Curricular units 2nd sem (grade)',
]

target_col = 'Target' ################

In [30]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate

def run_kfold_pipeline(
        df_raw: pd.DataFrame,
        ft_cols_qualitative: list[str],
        ft_cols_qualitative_to_one_hot: list[str],
        ft_cols_quantitative: list[str],
        target_col: str,
        scaler,
        model,
        n_splits: int = 5,
        random_state: int = 42,
        shuffle: bool = True,
        scoring=None
    ):
    
    binary_flag_cols = list(set(ft_cols_qualitative) - set(ft_cols_qualitative_to_one_hot))

    preprocessor = ColumnTransformer(
        transformers=[
            ("num",    scaler,                                   ft_cols_quantitative),
            ("onehot", OneHotEncoder(handle_unknown="ignore"),   ft_cols_qualitative_to_one_hot),
            ("binary", "passthrough",                            binary_flag_cols)
        ],
        remainder="drop"
    )

    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    X = df_raw.drop(columns=[target_col])
    y = df_raw[target_col]

    cv = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    results = cross_validate(pipe, X, y, cv=cv, n_jobs=-1,
                             return_train_score=True, scoring=scoring)

    return results



In [31]:
df = df_raw[ft_cols_qualitative + ft_cols_quantitative + [target_col]].copy()

df = df[df['Curricular units 1st sem (approved)'] <= 11]
df = df[df['Curricular units 2nd sem (approved)'] <= 12]

df = df[df[target_col].isin(['Dropout', 'Graduate'])]
df[target_col] = [1 if x == 'Dropout' else 0 for x in df[target_col]]
df

Unnamed: 0,Tuition fees up to date,Scholarship holder,Debtor,Application order,Age at enrollment,Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Target
0,1,0,0,5,20,0,0.000000,0,0.000000,1
1,0,0,0,1,19,6,14.000000,6,13.666667,0
2,0,0,0,5,19,0,0.000000,0,0.000000,1
3,1,0,0,2,20,6,13.428571,5,12.400000,0
4,1,0,0,1,45,5,12.333333,6,13.000000,0
...,...,...,...,...,...,...,...,...,...,...
4419,1,0,0,6,19,5,13.600000,5,12.666667,0
4420,0,0,1,2,18,6,12.000000,2,11.000000,1
4421,1,1,0,1,30,7,14.912500,1,13.500000,1
4422,1,1,0,1,20,5,13.800000,5,12.000000,0


### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import RobustScaler

list_max_iter = [10, 20, 30, 40, 50, 100, 200, 300, 500]
list_cs = [1,2,3,4,5,8,10]
best_parameters = {
    "max_iter": None,
    "cs": None,
    "test_acc": -math.inf,
    "cv_scores": None
}

for max_iter, cs in tqdm(itertools.product(list_max_iter, list_cs), 
                        total=len(list_max_iter) * len(list_cs), 
                        desc="Running Logistic Regression CV"):
    cv_run = run_kfold_pipeline(
        df_raw=df,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
        scaler=RobustScaler(),
        model=LogisticRegressionCV(
            max_iter=max_iter,
            Cs=cs,
        ),
        scoring = {
            "acc": "accuracy",
            "recall": "recall",
            "precision": "precision",
            "f1": "f1",
            "roc_auc": "roc_auc"
        }
    )
    
    mean_test_acc = cv_run["test_acc"].mean()
    
    if mean_test_acc > best_parameters["test_acc"]:
        best_parameters["max_iter"] = max_iter
        best_parameters["cs"] = cs
        best_parameters["test_acc"] = mean_test_acc
        best_parameters["cv_scores"] = cv_run

print(f"Best parameters: {best_parameters}")
cv_scores_logistic = best_parameters["cv_scores"]

pd.DataFrame(cv_scores_logistic).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]


Running Logistic Regression CV: 100%|██████████| 63/63 [00:13<00:00,  4.60it/s]

Best parameters: {'max_iter': 10, 'cs': 2, 'test_acc': np.float64(0.8890815318775376), 'cv_scores': {'fit_time': array([0.04518175, 0.04668808, 0.04417396, 0.05169487, 0.04064918]), 'score_time': array([0.0115428 , 0.01102901, 0.01154208, 0.01153755, 0.01003838]), 'test_acc': array([0.88746439, 0.88176638, 0.89443652, 0.88873039, 0.89300999]), 'train_acc': array([0.88983957, 0.8912656 , 0.89059159, 0.88809694, 0.88987883]), 'test_recall': array([0.80357143, 0.80357143, 0.85      , 0.82078853, 0.81362007]), 'train_recall': array([0.82110912, 0.82647585, 0.81842576, 0.82126899, 0.82126899]), 'test_precision': array([0.90361446, 0.88932806, 0.88148148, 0.89105058, 0.908     ]), 'train_precision': array([0.89386563, 0.89275362, 0.89793916, 0.88964182, 0.89396887]), 'test_f1': array([0.85066163, 0.84427767, 0.86545455, 0.85447761, 0.85822306]), 'train_f1': array([0.85594406, 0.8583372 , 0.85634066, 0.85408922, 0.85607825]), 'test_roc_auc': array([0.93474526, 0.92164015, 0.92329488, 0.931500




Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.045678,0.011138,0.889082,0.889935,0.81831,0.82171,0.894695,0.893634,0.854619,0.856158,0.930804,0.931603
std,0.004032,0.000653,0.005009,0.001182,0.01915,0.002926,0.010877,0.002975,0.007953,0.001509,0.008678,0.002244
min,0.040649,0.010038,0.881766,0.888097,0.803571,0.818426,0.881481,0.889642,0.844278,0.854089,0.92164,0.928515
max,0.051695,0.011543,0.894437,0.891266,0.85,0.826476,0.908,0.897939,0.865455,0.858337,0.942839,0.934335


In [57]:
print(
    pd.DataFrame(cv_scores_logistic).describe(percentiles=[]).loc[["mean", "std", "min", "max"]][
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc']
    ].to_latex().replace("_", "\\_")
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.889935 & 0.889082 & 0.818310 & 0.894695 & 0.854619 & 0.930804 \\
std & 0.001182 & 0.005009 & 0.019150 & 0.010877 & 0.007953 & 0.008678 \\
min & 0.888097 & 0.881766 & 0.803571 & 0.881481 & 0.844278 & 0.921640 \\
max & 0.891266 & 0.894437 & 0.850000 & 0.908000 & 0.865455 & 0.942839 \\
\bottomrule
\end{tabular}



### RandomForestClassifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

list_max_depth = [2, 3, 5, 10]
list_n_estimators = [50, 100, 150, 200]
list_min_samples_leaf = [3, 5, 10, 20]

best_parameters = {
    "max_depth": None,
    "n_estimators": None,
    "min_samples_leaf": None,
    "test_acc": -math.inf,
    "cv_scores": None
}

for max_depth, n_estimators, min_samples_leaf in tqdm(
    itertools.product(list_max_depth, list_n_estimators, list_min_samples_leaf),
    total=len(list_max_depth) * len(list_n_estimators) * len(list_min_samples_leaf)
):    
    cv_run = run_kfold_pipeline(
        df_raw=df,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
        scaler=RobustScaler(),
        model=RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            min_samples_leaf=min_samples_leaf,
        ),
        scoring = {
            "acc": "accuracy",
            "recall": "recall",
            "precision": "precision",
            "f1": "f1",
            "roc_auc": "roc_auc"
        }
    )

    mean_test_acc = cv_run['test_acc'].mean()
    if mean_test_acc > best_parameters["test_acc"]:
        best_parameters["max_depth"] = max_depth
        best_parameters["n_estimators"] = n_estimators
        best_parameters["min_samples_leaf"] = min_samples_leaf
        best_parameters["test_acc"] = mean_test_acc
        best_parameters["cv_scores"] = cv_run

print(f"Best parameters: {best_parameters}")

cv_scores_rf = best_parameters["cv_scores"]

pd.DataFrame(cv_scores_rf).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]

100%|██████████| 64/64 [00:17<00:00,  3.56it/s]

Best parameters: {'max_depth': 10, 'n_estimators': 100, 'min_samples_leaf': 3, 'test_acc': np.float64(0.8950705341575528), 'cv_scores': {'fit_time': array([0.22209764, 0.23015237, 0.22209764, 0.23015237, 0.26527524]), 'score_time': array([0.0324533 , 0.03311825, 0.03295922, 0.0264082 , 0.03402877]), 'test_acc': array([0.89173789, 0.88603989, 0.89015692, 0.90014265, 0.90727532]), 'train_acc': array([0.9201426 , 0.92192513, 0.92551675, 0.92195296, 0.91874555]), 'test_recall': array([0.82857143, 0.8       , 0.82857143, 0.83870968, 0.82795699]), 'train_recall': array([0.86314848, 0.85778175, 0.87030411, 0.86773905, 0.85344057]), 'test_precision': array([0.89230769, 0.90322581, 0.88888889, 0.9034749 , 0.93145161]), 'train_precision': array([0.93146718, 0.94111874, 0.93828351, 0.9318618 , 0.93719333]), 'test_f1': array([0.85925926, 0.84848485, 0.85767098, 0.86988848, 0.87666034]), 'train_f1': array([0.89600743, 0.89751989, 0.90301624, 0.89865803, 0.89335828]), 'test_roc_auc': array([0.937931




Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.233955,0.031794,0.895071,0.921657,0.824762,0.862483,0.90387,0.935985,0.862393,0.897712,0.940038,0.974831
std,0.017966,0.003064,0.008536,0.002541,0.014551,0.006948,0.016729,0.004199,0.011015,0.003568,0.006931,0.000777
min,0.222098,0.026408,0.88604,0.918746,0.8,0.853441,0.888889,0.931467,0.848485,0.893358,0.933446,0.973783
max,0.265275,0.034029,0.907275,0.925517,0.83871,0.870304,0.931452,0.941119,0.87666,0.903016,0.95163,0.975809


In [56]:
print(
    pd.DataFrame(cv_scores_rf).describe(percentiles=[]).loc[["mean", "std", "min", "max"]][
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc']
    ].to_latex().replace("_", "\\_")
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.921657 & 0.895071 & 0.824762 & 0.903870 & 0.862393 & 0.940038 \\
std & 0.002541 & 0.008536 & 0.014551 & 0.016729 & 0.011015 & 0.006931 \\
min & 0.918746 & 0.886040 & 0.800000 & 0.888889 & 0.848485 & 0.933446 \\
max & 0.925517 & 0.907275 & 0.838710 & 0.931452 & 0.876660 & 0.951630 \\
\bottomrule
\end{tabular}



In [50]:
from xgboost import XGBClassifier

list_max_depth = [2, 3, 4, 5]
list_lambda = [3, 2, 1, 0.5]
list_learning_rate = [0.5, 0.25, 0.1, 0.05]
list_n_estimators = [25, 50, 100, 150]


best_parameters = {
    "max_depth": None,
    "n_estimators": None,
    "reg_lambda": None,
    "learning_rate": None,
    "test_acc": -math.inf,
    "cv_scores": None
}

for max_depth, n_estimators, reg_lambda, lr in tqdm(
    itertools.product(list_max_depth, list_n_estimators, list_lambda, list_learning_rate),
    total=len(list_max_depth) * len(list_n_estimators) * len(list_lambda) * len(list_learning_rate)
):
    cv_run = run_kfold_pipeline(
        df_raw=df,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
        scaler=RobustScaler(),
        model=XGBClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            reg_lambda=reg_lambda,
            learning_rate=lr,
        ),
        scoring = {
            "acc": "accuracy",
            "recall": "recall",
            "precision": "precision",
            "f1": "f1",
            "roc_auc": "roc_auc"
        }
    )

    mean_test_acc = cv_run['test_acc'].mean()
    if mean_test_acc > best_parameters["test_acc"]:
        best_parameters["max_depth"] = max_depth
        best_parameters["n_estimators"] = n_estimators
        best_parameters["reg_lambda"] = reg_lambda
        best_parameters["learning_rate"] = lr
        best_parameters["test_acc"] = mean_test_acc
        best_parameters["cv_scores"] = cv_run
        
print(f"Best parameters: {best_parameters}")

cv_scores_xgbc = best_parameters["cv_scores"]

pd.DataFrame(cv_scores_xgbc).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]

100%|██████████| 256/256 [00:27<00:00,  9.38it/s]

Best parameters: {'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 3, 'learning_rate': 0.1, 'test_acc': np.float64(0.8953546216028384), 'cv_scores': {'fit_time': array([0.03284311, 0.03184509, 0.03236032, 0.03127813, 0.03127813]), 'score_time': array([0.01297283, 0.01196551, 0.01245761, 0.01203346, 0.01203346]), 'test_acc': array([0.8988604 , 0.88319088, 0.88873039, 0.90156919, 0.90442225]), 'train_acc': array([0.9144385 , 0.91907308, 0.92195296, 0.91625089, 0.91339986]), 'test_recall': array([0.83928571, 0.79642857, 0.82857143, 0.82437276, 0.82795699]), 'train_recall': array([0.85509839, 0.85778175, 0.86135957, 0.85969616, 0.84361037]), 'test_precision': array([0.90038314, 0.89919355, 0.88549618, 0.92      , 0.924     ]), 'train_precision': array([0.9245648 , 0.93378773, 0.93768257, 0.925     , 0.93280632]), 'test_f1': array([0.86876155, 0.84469697, 0.85608856, 0.86956522, 0.87334594]), 'train_f1': array([0.88847584, 0.89417249, 0.8979021 , 0.89115331, 0.88596903]), 'test_roc_auc': 




Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.031921,0.012293,0.895355,0.917023,0.823323,0.855509,0.905815,0.930768,0.862492,0.891535,0.937652,0.966414
std,0.000685,0.000428,0.009013,0.003497,0.016033,0.007048,0.015954,0.005763,0.011883,0.00469,0.006926,0.001905
min,0.031278,0.011966,0.883191,0.9134,0.796429,0.84361,0.885496,0.924565,0.844697,0.885969,0.92894,0.963252
max,0.032843,0.012973,0.904422,0.921953,0.839286,0.86136,0.924,0.937683,0.873346,0.897902,0.947808,0.96814


In [55]:
print(
    pd.DataFrame(cv_scores_xgbc).describe(percentiles=[]).loc[["mean", "std", "min", "max"]][
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc']
    ].to_latex().replace("_", "\\_")
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.917023 & 0.895355 & 0.823323 & 0.905815 & 0.862492 & 0.937652 \\
std & 0.003497 & 0.009013 & 0.016033 & 0.015954 & 0.011883 & 0.006926 \\
min & 0.913400 & 0.883191 & 0.796429 & 0.885496 & 0.844697 & 0.928940 \\
max & 0.921953 & 0.904422 & 0.839286 & 0.924000 & 0.873346 & 0.947808 \\
\bottomrule
\end{tabular}



In [40]:
# cv_scores_logistic cv_scores_rf cv_scores_xgbc
# create comparison dataframe using mean test accuracy, mean f1, mean recall, mean precision, mean roc_auc
comparison_df = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest", "XGBoost"],
    "Mean Test Accuracy": [
        cv_scores_logistic["test_acc"].mean(),
        cv_scores_rf["test_acc"].mean(),
        cv_scores_xgbc["test_acc"].mean()
    ],
    "Mean F1": [
        cv_scores_logistic["test_f1"].mean(),
        cv_scores_rf["test_f1"].mean(),
        cv_scores_xgbc["test_f1"].mean()
    ],
    "Mean Recall": [
        cv_scores_logistic["test_recall"].mean(),
        cv_scores_rf["test_recall"].mean(),
        cv_scores_xgbc["test_recall"].mean()
    ],
    "Mean Precision": [
        cv_scores_logistic["test_precision"].mean(),
        cv_scores_rf["test_precision"].mean(),
        cv_scores_xgbc["test_precision"].mean()
    ],
    "Mean ROC AUC": [
        cv_scores_logistic["test_roc_auc"].mean(),
        cv_scores_rf["test_roc_auc"].mean(),
        cv_scores_xgbc["test_roc_auc"].mean()
    ]
})

comparison_df

Unnamed: 0,Model,Mean Test Accuracy,Mean F1,Mean Recall,Mean Precision,Mean ROC AUC
0,Logistic Regression,0.889082,0.854619,0.81831,0.894695,0.930804
1,Random Forest,0.895071,0.862393,0.824762,0.90387,0.940038
2,XGBoost,0.895355,0.862492,0.823323,0.905815,0.937652


In [54]:
print(comparison_df.to_latex().replace("_", "\\_"))

\begin{tabular}{llrrrrr}
\toprule
 & Model & Mean Test Accuracy & Mean F1 & Mean Recall & Mean Precision & Mean ROC AUC \\
\midrule
0 & Logistic Regression & 0.889082 & 0.854619 & 0.818310 & 0.894695 & 0.930804 \\
1 & Random Forest & 0.895071 & 0.862393 & 0.824762 & 0.903870 & 0.940038 \\
2 & XGBoost & 0.895355 & 0.862492 & 0.823323 & 0.905815 & 0.937652 \\
\bottomrule
\end{tabular}

