In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import itertools
from matplotlib.ticker import ScalarFormatter
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [40]:
# Load the raw dataset. The file is semicolon-delimited, and "utf-8-sig"
# drops a leading byte-order mark if the file has one.
df_raw = pd.read_csv(
    "dropout_rate.csv",
    sep=";",
    encoding="utf-8-sig",
)

In [41]:
# List the raw column names. Later cells select columns by these exact strings,
# so quirks visible here (e.g. the trailing tab in 'Daytime/evening attendance\t'
# and the 'Nacionality' spelling) must be reproduced verbatim when referenced.
df_raw.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd

### Chọn feature

In [42]:
# Name of the label column.
target_col = 'Target'

# Qualitative (categorical) features. Those not also listed in
# ft_cols_qualitative_to_one_hot are fed to the model unchanged.
ft_cols_qualitative = ['Tuition fees up to date', 'Scholarship holder', 'Debtor']

# Qualitative features that should be one-hot encoded (none at the moment).
ft_cols_qualitative_to_one_hot = []

# Numeric features; these are the columns handed to the scaler.
ft_cols_quantitative = [
    'Application order',
    'Age at enrollment',
    # first-semester performance
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    # second-semester performance
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
]

In [43]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate

def run_kfold_pipeline(
        df_raw: pd.DataFrame,
        ft_cols_qualitative: list[str],
        ft_cols_qualitative_to_one_hot: list[str],
        ft_cols_quantitative: list[str],
        target_col: str,
        scaler,
        model,
        n_splits: int = 5,
        random_state: int = 42,
        shuffle: bool = True,
        scoring=None
    ):
    """Cross-validate a preprocessing + model pipeline with stratified K-fold.

    The pipeline scales the quantitative columns with ``scaler``, one-hot
    encodes ``ft_cols_qualitative_to_one_hot``, passes the remaining
    qualitative columns through unchanged, drops every other column, and
    then fits ``model``.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Frame holding the feature columns and ``target_col``.
    ft_cols_qualitative : list[str]
        Qualitative feature columns.
    ft_cols_qualitative_to_one_hot : list[str]
        Qualitative columns to one-hot encode; the qualitative columns not
        listed here are passed through as-is.
    ft_cols_quantitative : list[str]
        Columns transformed by ``scaler``.
    target_col : str
        Label column; removed from the features and used as ``y``.
    scaler
        Transformer applied to ``ft_cols_quantitative``.
    model
        Estimator used as the final pipeline step.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed for fold shuffling. Only used when ``shuffle`` is True.
    shuffle : bool
        Whether to shuffle samples before splitting into folds.
    scoring
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result (fit/score times plus train and test
        scores, one array entry per fold).
    """
    # Keep the caller's column order (de-duplicated). Going through set() made
    # the order depend on string hash randomisation, so the feature matrix
    # layout could differ from one kernel session to the next.
    one_hot_cols = set(ft_cols_qualitative_to_one_hot)
    binary_flag_cols = list(dict.fromkeys(
        col for col in ft_cols_qualitative if col not in one_hot_cols
    ))

    preprocessor = ColumnTransformer(
        transformers=[
            ("num",    scaler,                                   ft_cols_quantitative),
            ("onehot", OneHotEncoder(handle_unknown="ignore"),   ft_cols_qualitative_to_one_hot),
            ("binary", "passthrough",                            binary_flag_cols)
        ],
        remainder="drop"
    )

    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    X = df_raw.drop(columns=[target_col])
    y = df_raw[target_col]

    # StratifiedKFold rejects a non-None random_state when shuffle is False,
    # so the seed is only forwarded when it can actually take effect.
    cv = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    results = cross_validate(pipe, X, y, cv=cv, n_jobs=-1,
                             return_train_score=True, scoring=scoring)

    return results



In [44]:
# Assemble the modelling frame: selected features plus the target column.
model_cols = [*ft_cols_qualitative, *ft_cols_quantitative, target_col]
df = df_raw[model_cols]

# Rows to keep:
#   * approved-unit counts at or below 11 (1st sem) and 12 (2nd sem)
#     NOTE(review): presumably these caps trim outliers - confirm the rationale
#   * a final outcome of 'Dropout' or 'Graduate' (any other label is excluded)
keep_rows = (
    (df['Curricular units 1st sem (approved)'] <= 11)
    & (df['Curricular units 2nd sem (approved)'] <= 12)
    & df[target_col].isin(['Dropout', 'Graduate'])
)
df = df.loc[keep_rows].copy()

# Binary target: 1 for 'Dropout', 0 for 'Graduate'.
df[target_col] = [int(label == 'Dropout') for label in df[target_col]]
df

Unnamed: 0,Tuition fees up to date,Scholarship holder,Debtor,Application order,Age at enrollment,Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Target
0,1,0,0,5,20,0,0.000000,0,0.000000,1
1,0,0,0,1,19,6,14.000000,6,13.666667,0
2,0,0,0,5,19,0,0.000000,0,0.000000,1
3,1,0,0,2,20,6,13.428571,5,12.400000,0
4,1,0,0,1,45,5,12.333333,6,13.000000,0
...,...,...,...,...,...,...,...,...,...,...
4419,1,0,0,6,19,5,13.600000,5,12.666667,0
4420,0,0,1,2,18,6,12.000000,2,11.000000,1
4421,1,1,0,1,30,7,14.912500,1,13.500000,1
4422,1,1,0,1,20,5,13.800000,5,12.000000,0


### LogisticRegression

In [45]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler

# Hyper-parameter search for logistic regression.
# Each (max_iter, Cs) pair is cross-validated with run_kfold_pipeline; the pair
# with the highest mean test accuracy wins (ties keep the earliest pair).
list_max_iter = [10, 20, 30, 40, 50, 100, 200, 300, 500]
# Forwarded as LogisticRegressionCV(Cs=...): an integer Cs is the number of
# candidate C values in the estimator's own internal grid.
list_cs = [1, 2, 3, 4, 5, 8, 10]
best_parameters = {
    "max_iter": None,
    "cs": None,
    "test_acc": -math.inf,
    "cv_scores": None
}

# The progress-bar total is derived from the two lists actually iterated, so
# the cell runs on a fresh kernel and the bar matches the grid size.
for max_iter, cs in tqdm(itertools.product(list_max_iter, list_cs),
                        total=len(list_max_iter) * len(list_cs),
                        desc="Running Logistic Regression CV"):
    cv_run = run_kfold_pipeline(
        df_raw=df,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
        scaler=MinMaxScaler(),
        model=LogisticRegressionCV(
            max_iter=max_iter,
            Cs=cs,
        ),
        # Keys become the "test_<key>" / "train_<key>" entries of cv_run.
        scoring = {
            "acc": "accuracy",
            "recall": "recall",
            "precision": "precision",
            "f1": "f1",
            "roc_auc": "roc_auc"
        }
    )

    mean_test_acc = cv_run["test_acc"].mean()

    # Strict ">" keeps the first configuration that reaches a given accuracy.
    if mean_test_acc > best_parameters["test_acc"]:
        best_parameters["max_iter"] = max_iter
        best_parameters["cs"] = cs
        best_parameters["test_acc"] = mean_test_acc
        best_parameters["cv_scores"] = cv_run

print(f"Best parameters: {best_parameters}")
cv_scores_logistic = best_parameters["cv_scores"]

# Per-metric summary across folds for the winning configuration.
pd.DataFrame(cv_scores_logistic).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]


Running Logistic Regression CV: 63it [00:06, 10.36it/s]                        

Best parameters: {'max_iter': 20, 'cs': 8, 'test_acc': np.float64(0.8887962251728301), 'cv_scores': {'fit_time': array([0.06957793, 0.07143378, 0.07138944, 0.07701898, 0.07434249]), 'score_time': array([0.01098275, 0.01063728, 0.00963163, 0.01358008, 0.01268888]), 'test_acc': array([0.88746439, 0.88176638, 0.89586305, 0.88873039, 0.89015692]), 'train_acc': array([0.88770053, 0.89019608, 0.88916607, 0.88916607, 0.88667142]), 'test_recall': array([0.80714286, 0.80357143, 0.85357143, 0.82078853, 0.80645161]), 'train_recall': array([0.82200358, 0.81932021, 0.8157424 , 0.82216265, 0.81590706]), 'test_precision': array([0.90039841, 0.88932806, 0.88191882, 0.89105058, 0.90725806]), 'train_precision': array([0.88792271, 0.8962818 , 0.89675516, 0.89147287, 0.89073171]), 'test_f1': array([0.85122411, 0.84427767, 0.86751361, 0.85447761, 0.85388994]), 'train_f1': array([0.85369252, 0.85607477, 0.85433255, 0.85541609, 0.8516791 ]), 'test_roc_auc': array([0.93471987, 0.92156398, 0.92408381, 0.931483




Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.072753,0.011504,0.888796,0.88858,0.818305,0.819027,0.893991,0.892633,0.854277,0.854239,0.930795,0.931676
std,0.002933,0.0016,0.005074,0.001389,0.020809,0.003135,0.009915,0.00379,0.008437,0.001704,0.008284,0.002083
min,0.069578,0.009632,0.881766,0.886671,0.803571,0.815742,0.881919,0.887923,0.844278,0.851679,0.921564,0.928759
max,0.077019,0.01358,0.895863,0.890196,0.853571,0.822163,0.907258,0.896755,0.867514,0.856075,0.942126,0.934363


### RandomForestClassifier

In [46]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter search for the random forest.
# Each (max_depth, n_estimators, min_samples_leaf) triple is cross-validated
# with run_kfold_pipeline; the triple with the highest mean test accuracy wins
# (ties keep the earliest triple).
list_max_depth = [2, 3, 5, 10]
list_n_estimators = [50, 100, 150, 200]
list_min_samples_leaf = [3, 5, 10, 20]

best_parameters = {
    "max_depth": None,
    "n_estimators": None,
    "min_samples_leaf": None,
    "test_acc": -math.inf,
    "cv_scores": None
}

for max_depth, n_estimators, min_samples_leaf in tqdm(
    itertools.product(list_max_depth, list_n_estimators, list_min_samples_leaf),
    total=len(list_max_depth) * len(list_n_estimators) * len(list_min_samples_leaf),
    desc="Running Random Forest CV"
):
    cv_run = run_kfold_pipeline(
        df_raw=df,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
        scaler=MinMaxScaler(),
        model=RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=n_estimators,
            min_samples_leaf=min_samples_leaf,
            # Seed the forest so the selected configuration and its scores
            # are reproducible; without it every run draws different
            # bootstraps and feature subsets. 42 matches the fold seed
            # default of run_kfold_pipeline.
            random_state=42,
        ),
        # Keys become the "test_<key>" / "train_<key>" entries of cv_run.
        scoring = {
            "acc": "accuracy",
            "recall": "recall",
            "precision": "precision",
            "f1": "f1",
            "roc_auc": "roc_auc"
        }
    )

    mean_test_acc = cv_run['test_acc'].mean()
    # Strict ">" keeps the first configuration that reaches a given accuracy.
    if mean_test_acc > best_parameters["test_acc"]:
        best_parameters["max_depth"] = max_depth
        best_parameters["n_estimators"] = n_estimators
        best_parameters["min_samples_leaf"] = min_samples_leaf
        best_parameters["test_acc"] = mean_test_acc
        best_parameters["cv_scores"] = cv_run

print(f"Best parameters: {best_parameters}")

cv_scores_rf = best_parameters["cv_scores"]

# Per-metric summary across folds for the winning configuration.
pd.DataFrame(cv_scores_rf).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]

100%|██████████| 64/64 [00:18<00:00,  3.43it/s]

Best parameters: {'max_depth': 10, 'n_estimators': 100, 'min_samples_leaf': 3, 'test_acc': np.float64(0.8953570601216821), 'cv_scores': {'fit_time': array([0.22575474, 0.22626948, 0.20266294, 0.21911454, 0.21455455]), 'score_time': array([0.02668476, 0.03018045, 0.03081703, 0.02421737, 0.02877712]), 'test_acc': array([0.88461538, 0.88888889, 0.89443652, 0.90299572, 0.90584879]), 'train_acc': array([0.91871658, 0.92263815, 0.92409123, 0.92337847, 0.91803279]), 'test_recall': array([0.81785714, 0.80357143, 0.83571429, 0.83870968, 0.83154122]), 'train_recall': array([0.85957066, 0.85867621, 0.8667263 , 0.87042002, 0.85254692]), 'test_precision': array([0.88416988, 0.90725806, 0.89312977, 0.91050584, 0.92430279]), 'train_precision': array([0.93120155, 0.9421001 , 0.93804453, 0.93295019, 0.93621197]), 'test_f1': array([0.84972171, 0.85227273, 0.86346863, 0.87313433, 0.8754717 ]), 'train_f1': array([0.89395349, 0.89845578, 0.90097629, 0.90060102, 0.89242283]), 'test_roc_auc': array([0.936234




Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.217671,0.028135,0.895357,0.921371,0.825479,0.861588,0.903873,0.936102,0.862814,0.897282,0.940097,0.97525
std,0.009699,0.002704,0.009035,0.002794,0.014617,0.007048,0.015624,0.004293,0.011723,0.003897,0.006587,0.000862
min,0.202663,0.024217,0.884615,0.918033,0.803571,0.852547,0.88417,0.931202,0.849722,0.892423,0.934673,0.9739
max,0.226269,0.030817,0.905849,0.924091,0.83871,0.87042,0.924303,0.9421,0.875472,0.900976,0.951367,0.976078


In [47]:
from xgboost import XGBClassifier

# XGBoost hyper-parameter sweep over tree depth, number of trees and L2
# regularisation strength. The configuration with the best mean test accuracy
# across folds is retained (the first one found wins ties).
list_max_depth = [2, 3, 5, 8]
list_lambda = [2, 1, 0.1, 0.01]
list_n_estimators = [25, 50, 100, 150]

# Metrics computed per fold; each key shows up as "test_<key>" / "train_<key>".
xgb_scoring = {
    "acc": "accuracy",
    "recall": "recall",
    "precision": "precision",
    "f1": "f1",
    "roc_auc": "roc_auc"
}

best_parameters = dict(
    max_depth=None,
    n_estimators=None,
    reg_lambda=None,
    test_acc=-math.inf,
    cv_scores=None,
)

# Materialise the grid once so the progress-bar total comes straight from it.
xgb_grid = list(itertools.product(list_max_depth, list_n_estimators, list_lambda))

for max_depth, n_estimators, reg_lambda in tqdm(xgb_grid, total=len(xgb_grid)):
    candidate = XGBClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        reg_lambda=reg_lambda,
    )
    cv_run = run_kfold_pipeline(
        df_raw=df,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
        scaler=MinMaxScaler(),
        model=candidate,
        scoring=xgb_scoring,
    )

    mean_test_acc = cv_run['test_acc'].mean()
    if mean_test_acc > best_parameters["test_acc"]:
        best_parameters.update(
            max_depth=max_depth,
            n_estimators=n_estimators,
            reg_lambda=reg_lambda,
            test_acc=mean_test_acc,
            cv_scores=cv_run,
        )

print(f"Best parameters: {best_parameters}")

cv_scores_xgbc = best_parameters["cv_scores"]

# Fold-level summary (mean / std / min / max) for the winning configuration.
pd.DataFrame(cv_scores_xgbc).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]

100%|██████████| 64/64 [00:06<00:00,  9.41it/s]

Best parameters: {'max_depth': 2, 'n_estimators': 50, 'reg_lambda': 1, 'test_acc': np.float64(0.8956407411471605), 'cv_scores': {'fit_time': array([0.02281046, 0.02181339, 0.01680923, 0.02181482, 0.02231884]), 'score_time': array([0.01530504, 0.01316309, 0.01416016, 0.01723981, 0.01573634]), 'test_acc': array([0.8988604 , 0.88034188, 0.89443652, 0.90442225, 0.90014265]), 'train_acc': array([0.90623886, 0.9030303 , 0.90449038, 0.9023521 , 0.90627227]), 'test_recall': array([0.83928571, 0.79642857, 0.83214286, 0.83512545, 0.83512545]), 'train_recall': array([0.84257603, 0.83989267, 0.84347048, 0.84450402, 0.83646113]), 'test_precision': array([0.90038314, 0.892     , 0.89615385, 0.91732283, 0.90661479]), 'train_precision': array([0.9154519 , 0.90988372, 0.91023166, 0.90430622, 0.92125984]), 'test_f1': array([0.86876155, 0.84150943, 0.86296296, 0.87429644, 0.86940299]), 'train_f1': array([0.87750349, 0.87348837, 0.87558032, 0.87338262, 0.87681499]), 'test_roc_auc': array([0.93791892, 0.92




Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.021113,0.015121,0.895641,0.904477,0.827622,0.841381,0.902495,0.912227,0.863387,0.875354,0.938732,0.954659
std,0.002441,0.001554,0.009264,0.001798,0.017622,0.003239,0.009895,0.006407,0.012873,0.001882,0.006429,0.001281
min,0.016809,0.013163,0.880342,0.902352,0.796429,0.836461,0.892,0.904306,0.841509,0.873383,0.928432,0.952625
max,0.02281,0.01724,0.904422,0.906272,0.839286,0.844504,0.917323,0.92126,0.874296,0.877503,0.945731,0.956178
