In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import itertools
from tqdm import tqdm

In [3]:
# Load the personality survey table from the CSV next to the notebook and
# display it (pandas truncates the rendering of long frames on its own).
df_raw = pd.read_csv(filepath_or_buffer='personality_datasert.csv')
df_raw

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,3.0,Yes,2.0,0.0,Introvert


In [4]:
# Count nulls per column and keep only the columns that actually contain some;
# an empty Series therefore means the table has no missing values.
missing_value = df_raw.isnull().sum().loc[lambda null_counts: null_counts > 0]
missing_value

Series([], dtype: int64)

In [5]:
# Encode the two Yes/No flags and the Introvert/Extrovert label as 0/1 integers.
#
# The encoding is written back into `df_raw` because the cells below read the
# encoded columns from that name. To keep the cell safe to re-run, columns that
# are already numeric are skipped: mapping 0/1 through a {'Yes': 1, 'No': 0}
# dictionary a second time would silently turn every value into NaN.
_LABEL_ENCODINGS = {
    'Stage_fear': {'Yes': 1, 'No': 0},
    'Drained_after_socializing': {'Yes': 1, 'No': 0},
    'Personality': {'Introvert': 0, 'Extrovert': 1},
}

for _col, _mapping in _LABEL_ENCODINGS.items():
    if pd.api.types.is_numeric_dtype(df_raw[_col]):
        continue  # already encoded by an earlier run of this cell

    _encoded = df_raw[_col].map(_mapping)

    # `.map` yields NaN for labels missing from the mapping; fail loudly instead
    # of letting unexpected spellings disappear into NaN.
    _unmapped = _encoded.isna() & df_raw[_col].notna()
    if _unmapped.any():
        raise ValueError(
            f"Unexpected values in column {_col!r}: "
            f"{sorted(df_raw.loc[_unmapped, _col].astype(str).unique())}"
        )

    df_raw[_col] = _encoded

In [6]:
# Spot-check the first two rows after the label encoding above.
df_raw.head(n=2)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,1
1,9.0,1,0.0,0.0,1,0.0,3.0,0


In [7]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler

def run_kfold_pipeline(
        df_raw: pd.DataFrame,
        ft_cols_qualitative: list[str],
        ft_cols_qualitative_to_one_hot: list[str],
        ft_cols_quantitative: list[str],
        target_col: str,
        scaler,
        model,
        n_splits: int = 5,
        random_state: int = 42,
        shuffle: bool = True,
        scoring=None
    ):
    """Cross-validate ``model`` behind a preprocessing pipeline with stratified K-fold.

    The pipeline applies ``scaler`` to the quantitative columns, one-hot encodes
    the columns listed in ``ft_cols_qualitative_to_one_hot`` and passes the
    remaining qualitative columns through unchanged. Every other column of
    ``df_raw`` (apart from the target) is dropped.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Table holding the feature columns and ``target_col``.
    ft_cols_qualitative : list[str]
        All qualitative feature columns.
    ft_cols_qualitative_to_one_hot : list[str]
        Subset of the qualitative columns that must be one-hot encoded; the
        rest are treated as ready-to-use flags and passed through.
    ft_cols_quantitative : list[str]
        Numeric feature columns handed to ``scaler``.
    target_col : str
        Name of the label column.
    scaler
        Transformer applied to the quantitative columns.
    model
        Estimator placed at the end of the pipeline.
    n_splits : int
        Number of folds.
    random_state : int
        Seed for the fold shuffling; only used when ``shuffle`` is true.
    shuffle : bool
        Whether to shuffle the rows before splitting into folds.
    scoring
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    The object returned by ``sklearn.model_selection.cross_validate`` (train
    scores included).
    """
    # Keep the caller's column order. A set difference would make the order of
    # the passthrough columns depend on string hashing, so the feature order
    # seen by the model could change from one kernel to the next.
    one_hot_cols = set(ft_cols_qualitative_to_one_hot)
    binary_flag_cols = [
        col for col in dict.fromkeys(ft_cols_qualitative) if col not in one_hot_cols
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ("num",    scaler,                                   ft_cols_quantitative),
            ("onehot", OneHotEncoder(handle_unknown="ignore"),   ft_cols_qualitative_to_one_hot),
            ("binary", "passthrough",                            binary_flag_cols)
        ],
        remainder="drop"
    )

    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    X = df_raw.drop(columns=[target_col])
    y = df_raw[target_col]

    # StratifiedKFold rejects a non-None random_state when shuffle is False, so
    # the seed is only forwarded when it can actually take effect.
    cv = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    results = cross_validate(pipe, X, y, cv=cv, n_jobs=-1,
                             return_train_score=True, scoring=scoring)

    return results



In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics collected for every cross-validation run. Each key shows up as
# ``test_<key>`` / ``train_<key>`` in the result returned by run_kfold_pipeline.
_EXP_SCORING = {
    "acc": "accuracy",
    "recall": "recall",
    "precision": "precision",
    "f1": "f1",
    "roc_auc": "roc_auc",
}


def _grid_search_by_accuracy(df_run, pipeline_kwargs, param_grid, build_model, desc=None):
    """Cross-validate every combination of ``param_grid`` and keep the most accurate one.

    ``param_grid`` maps a parameter name to its candidate values; combinations
    are visited in ``itertools.product`` order of the dictionary's values.
    ``build_model`` is called with one combination as keyword arguments and must
    return the estimator to evaluate. ``pipeline_kwargs`` is forwarded to
    ``run_kfold_pipeline`` unchanged.

    Returns a dict holding the winning value of every grid parameter plus
    ``"test_acc"`` (mean test accuracy) and ``"cv_scores"`` (the full
    cross-validation result of the winning combination).
    """
    names = list(param_grid)
    combos = list(itertools.product(*param_grid.values()))

    best = dict.fromkeys(names)
    best["test_acc"] = -math.inf
    best["cv_scores"] = None

    for values in tqdm(combos, total=len(combos), desc=desc):
        candidate = dict(zip(names, values))
        cv_run = run_kfold_pipeline(
            df_raw=df_run,
            scaler=StandardScaler(),
            model=build_model(**candidate),
            scoring=dict(_EXP_SCORING),
            **pipeline_kwargs,
        )

        mean_test_acc = cv_run["test_acc"].mean()
        # Strict ">" keeps the first combination visited when several tie.
        if mean_test_acc > best["test_acc"]:
            best.update(candidate, test_acc=mean_test_acc, cv_scores=cv_run)

    return best


def run_exp(df_input, ft_cols_qualitative, ft_cols_qualitative_to_one_hot, ft_cols_quantitative,
            target_col, random_state=42):
    """Grid-search three classifiers with cross-validation and compare the winners.

    For logistic regression, random forest and XGBoost, every combination of a
    fixed hyperparameter grid is cross-validated through ``run_kfold_pipeline``
    and the combination with the highest mean test accuracy is kept. The best
    parameters of each model are printed.

    Note that the scores reported for a model are those of the configuration
    selected on the very same folds, so they are an optimistic estimate of
    performance on unseen data.

    Parameters
    ----------
    df_input : pd.DataFrame
        Source table; only the feature columns and ``target_col`` are used, and
        the table itself is not modified (a copy is taken).
    ft_cols_qualitative, ft_cols_qualitative_to_one_hot, ft_cols_quantitative : list[str]
        Feature column lists, forwarded to ``run_kfold_pipeline``.
    target_col : str
        Label column. The scorers used here ("recall", "precision", "f1") are
        the binary variants, so a two-class target is expected.
    random_state : int, default 42
        Seed shared by the fold shuffling and the stochastic models (random
        forest, XGBoost) so that repeated runs give the same numbers.

    Returns
    -------
    tuple
        ``(comparison_df, cv_scores_logistic, cv_scores_rf, cv_scores_xgbc)``:
        a one-row-per-model table of mean test metrics, followed by the full
        cross-validation result of the best configuration of each model.
    """
    df_run = df_input[ft_cols_qualitative + ft_cols_quantitative + [target_col]].copy()

    pipeline_kwargs = {
        "ft_cols_qualitative": ft_cols_qualitative,
        "ft_cols_qualitative_to_one_hot": ft_cols_qualitative_to_one_hot,
        "ft_cols_quantitative": ft_cols_quantitative,
        "target_col": target_col,
        "random_state": random_state,
    }

    # --- Logistic regression -------------------------------------------------
    # NOTE(review): max_iter is a solver iteration cap rather than a model
    # hyperparameter; presumably the small values stop the solver early —
    # confirm that searching over it is intended.
    best_parameters = _grid_search_by_accuracy(
        df_run,
        pipeline_kwargs,
        param_grid={
            "max_iter": [10, 20, 30, 40, 50, 100, 200, 300, 500],
            "cs": [1, 2, 3, 4, 5, 8, 10],
        },
        build_model=lambda max_iter, cs: LogisticRegressionCV(max_iter=max_iter, Cs=cs),
        desc="Running Logistic Regression CV",
    )
    print(f"Best parameters LogClass: {best_parameters}")
    cv_scores_logistic = best_parameters["cv_scores"]

    # --- Random forest -------------------------------------------------------
    best_parameters = _grid_search_by_accuracy(
        df_run,
        pipeline_kwargs,
        param_grid={
            "max_depth": [2, 3, 5, 10],
            "n_estimators": [50, 100, 150, 200],
            "min_samples_leaf": [3, 5, 10, 20],
        },
        # Seeded so that the bootstrap / feature sampling is repeatable.
        build_model=lambda **params: RandomForestClassifier(random_state=random_state, **params),
        desc="Running Random Forest CV",
    )
    print(f"Best parameters: {best_parameters}")
    cv_scores_rf = best_parameters["cv_scores"]

    # --- XGBoost -------------------------------------------------------------
    best_parameters = _grid_search_by_accuracy(
        df_run,
        pipeline_kwargs,
        param_grid={
            "max_depth": [6, 7, 8, 9],
            "n_estimators": [50, 100, 150, 200],
            "reg_lambda": [0.5, 1, 2],
            "learning_rate": [1, 0.8, 0.5, 0.25],
            "reg_alpha": [0.5, 1, 2],
        },
        build_model=lambda **params: XGBClassifier(random_state=random_state, **params),
        desc="Running XGBoost CV",
    )
    print(f"Best parameters XGB: {best_parameters}")
    cv_scores_xgbc = best_parameters["cv_scores"]

    # --- Side-by-side comparison of the three winners ------------------------
    all_scores = [cv_scores_logistic, cv_scores_rf, cv_scores_xgbc]
    metric_columns = {
        "Mean Test Accuracy": "test_acc",
        "Mean F1": "test_f1",
        "Mean Recall": "test_recall",
        "Mean Precision": "test_precision",
        "Mean ROC AUC": "test_roc_auc",
    }
    comparison_df = pd.DataFrame({
        "Model": ["Logistic Regression", "Random Forest", "XGBoost"],
        **{
            label: [scores[key].mean() for scores in all_scores]
            for label, key in metric_columns.items()
        },
    })

    return comparison_df, cv_scores_logistic, cv_scores_rf, cv_scores_xgbc

In [9]:
# NOTE(review): deliberate hard stop — presumably here so that "Run All" halts
# before the long-running experiment cells below. As long as it stays, the
# notebook cannot be re-executed top to bottom without skipping this cell by hand.
raise KeyError("Stop")

KeyError: 'Stop'

In [None]:

# Experiment: predict `Personality` from every other feature.
target_col = 'Personality'

# Both Yes/No flags are used as-is; nothing is one-hot encoded in this run.
ft_cols_qualitative = ['Stage_fear', 'Drained_after_socializing']
ft_cols_qualitative_to_one_hot = []

ft_cols_quantitative = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

(
    comparison_df,
    cv_scores_logistic,
    cv_scores_rf,
    cv_scores_xgbc,
) = run_exp(
    df_raw,
    ft_cols_qualitative,
    ft_cols_qualitative_to_one_hot,
    ft_cols_quantitative,
    target_col,
)
comparison_df

Running Logistic Regression CV: 100%|██████████| 63/63 [00:19<00:00,  3.19it/s]


Best parameters LogClass: {'max_iter': 10, 'cs': 1, 'test_acc': np.float64(0.9344827586206896), 'cv_scores': {'fit_time': array([0.02661085, 0.02761102, 0.02928615, 0.02710032, 0.03138924]), 'score_time': array([0.01466274, 0.01569319, 0.0185225 , 0.01269317, 0.01293302]), 'test_acc': array([0.9362069 , 0.92241379, 0.9362069 , 0.92586207, 0.95172414]), 'train_acc': array([0.93405172, 0.9375    , 0.93405172, 0.93663793, 0.93017241]), 'test_recall': array([0.92307692, 0.90939597, 0.91275168, 0.9261745 , 0.95637584]), 'train_recall': array([0.9261745 , 0.92958927, 0.92875105, 0.92539816, 0.91785415]), 'test_precision': array([0.95172414, 0.93771626, 0.96113074, 0.92929293, 0.95      ]), 'train_precision': array([0.94439692, 0.94786325, 0.94217687, 0.95008606, 0.94477998]), 'test_f1': array([0.93718166, 0.92333901, 0.9363167 , 0.92773109, 0.95317726]), 'train_f1': array([0.93519695, 0.93863733, 0.93541579, 0.93757962, 0.93112245]), 'test_roc_auc': array([0.91422773, 0.88958899, 0.91702366,

100%|██████████| 64/64 [00:15<00:00,  4.20it/s]


Best parameters: {'max_depth': 2, 'n_estimators': 50, 'min_samples_leaf': 3, 'test_acc': np.float64(0.9344827586206896), 'cv_scores': {'fit_time': array([0.05013967, 0.05013967, 0.05014014, 0.05715823, 0.05715823]), 'score_time': array([0.0150373 , 0.01604414, 0.01604509, 0.01758099, 0.01758099]), 'test_acc': array([0.9362069 , 0.92241379, 0.9362069 , 0.92586207, 0.95172414]), 'train_acc': array([0.93405172, 0.9375    , 0.93405172, 0.93663793, 0.93017241]), 'test_recall': array([0.92307692, 0.90939597, 0.91275168, 0.9261745 , 0.95637584]), 'train_recall': array([0.9261745 , 0.92958927, 0.92875105, 0.92539816, 0.91785415]), 'test_precision': array([0.95172414, 0.93771626, 0.96113074, 0.92929293, 0.95      ]), 'train_precision': array([0.94439692, 0.94786325, 0.94217687, 0.95008606, 0.94477998]), 'test_f1': array([0.93718166, 0.92333901, 0.9363167 , 0.92773109, 0.95317726]), 'train_f1': array([0.93519695, 0.93863733, 0.93541579, 0.93757962, 0.93112245]), 'test_roc_auc': array([0.95712874

100%|██████████| 576/576 [01:04<00:00,  8.95it/s]

Best parameters XGB: {'max_depth': 6, 'n_estimators': 100, 'reg_lambda': 0.5, 'reg_aplha': 2, 'learning_rate': 0.25, 'test_acc': np.float64(0.9331034482758621), 'cv_scores': {'fit_time': array([0.03658962, 0.02611709, 0.02511716, 0.02511716, 0.03552842]), 'score_time': array([0.01979804, 0.01478386, 0.01478386, 0.01377201, 0.01970291]), 'test_acc': array([0.93965517, 0.92068966, 0.9362069 , 0.92241379, 0.94655172]), 'train_acc': array([0.93534483, 0.93922414, 0.93663793, 0.93836207, 0.93491379]), 'test_recall': array([0.9264214 , 0.90604027, 0.91275168, 0.91946309, 0.95302013]), 'train_recall': array([0.9261745 , 0.9270746 , 0.92875105, 0.92539816, 0.9195306 ]), 'test_precision': array([0.95517241, 0.9375    , 0.96113074, 0.92881356, 0.94352159]), 'train_precision': array([0.94682676, 0.95344828, 0.94700855, 0.95336788, 0.95225694]), 'test_f1': array([0.94057725, 0.92150171, 0.9363167 , 0.92411467, 0.94824708]), 'train_f1': array([0.93638677, 0.9400765 , 0.93779094, 0.93917482, 0.93560




Unnamed: 0,Model,Mean Test Accuracy,Mean F1,Mean Recall,Mean Precision,Mean ROC AUC
0,Logistic Regression,0.934483,0.935549,0.925555,0.945973,0.90968
1,Random Forest,0.934483,0.935549,0.925555,0.945973,0.959291
2,XGBoost,0.933103,0.934151,0.923539,0.945228,0.962129


In [None]:
# Per-fold results of the best logistic-regression run, reduced to
# mean / std / min / max for every recorded metric.
(
    pd.DataFrame(cv_scores_logistic)
    .describe(percentiles=[])
    .loc[["mean", "std", "min", "max"]]
)

Unnamed: 0,fit_time,score_time,test_acc,train_acc,test_recall,train_recall,test_precision,train_precision,test_f1,train_f1,test_roc_auc,train_roc_auc
mean,0.0284,0.014901,0.934483,0.934483,0.925555,0.925553,0.945973,0.945861,0.935549,0.93559,0.90968,0.910142
std,0.001952,0.002374,0.011437,0.002859,0.018584,0.004642,0.012504,0.003112,0.011448,0.002889,0.021048,0.005243
min,0.026611,0.012693,0.922414,0.930172,0.909396,0.917854,0.929293,0.942177,0.923339,0.931122,0.888679,0.902891
max,0.031389,0.018523,0.951724,0.9375,0.956376,0.929589,0.961131,0.950086,0.953177,0.938637,0.938883,0.915411


In [None]:
# LaTeX table (mean / std / min / max across folds) for the logistic-regression
# run; underscores are escaped by hand so the column names compile in LaTeX.
print(
    pd.DataFrame(cv_scores_logistic)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.934483 & 0.934483 & 0.925555 & 0.945973 & 0.935549 & 0.909680 \\
std & 0.002859 & 0.011437 & 0.018584 & 0.012504 & 0.011448 & 0.021048 \\
min & 0.930172 & 0.922414 & 0.909396 & 0.929293 & 0.923339 & 0.888679 \\
max & 0.937500 & 0.951724 & 0.956376 & 0.961131 & 0.953177 & 0.938883 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX table (mean / std / min / max across folds) for the random-forest run;
# underscores are escaped by hand so the column names compile in LaTeX.
print(
    pd.DataFrame(cv_scores_rf)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.934483 & 0.934483 & 0.925555 & 0.945973 & 0.935549 & 0.959291 \\
std & 0.002859 & 0.011437 & 0.018584 & 0.012504 & 0.011448 & 0.008782 \\
min & 0.930172 & 0.922414 & 0.909396 & 0.929293 & 0.923339 & 0.945732 \\
max & 0.937500 & 0.951724 & 0.956376 & 0.961131 & 0.953177 & 0.968442 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX table (mean / std / min / max across folds) for the XGBoost run;
# underscores are escaped by hand so the column names compile in LaTeX.
print(
    pd.DataFrame(cv_scores_xgbc)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.936897 & 0.933103 & 0.923539 & 0.945228 & 0.934151 & 0.962129 \\
std & 0.001869 & 0.011200 & 0.018143 & 0.013075 & 0.011241 & 0.009516 \\
min & 0.934914 & 0.920690 & 0.906040 & 0.928814 & 0.921502 & 0.947868 \\
max & 0.939224 & 0.946552 & 0.953020 & 0.961131 & 0.948247 & 0.972851 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX version of the model-comparison table for the `Personality` target,
# with underscores escaped by hand.
print(
    comparison_df
    .to_latex()
    .replace("_", "\\_")
)

\begin{tabular}{llrrrrr}
\toprule
 & Model & Mean Test Accuracy & Mean F1 & Mean Recall & Mean Precision & Mean ROC AUC \\
\midrule
0 & Logistic Regression & 0.934483 & 0.935549 & 0.925555 & 0.945973 & 0.909680 \\
1 & Random Forest & 0.934483 & 0.935549 & 0.925555 & 0.945973 & 0.959291 \\
2 & XGBoost & 0.933103 & 0.934151 & 0.923539 & 0.945228 & 0.962129 \\
\bottomrule
\end{tabular}



In [None]:

# Experiment: predict `Drained_after_socializing` from the other features.
target_col = 'Drained_after_socializing'

# The target is left out of the qualitative features on purpose; nothing is
# one-hot encoded in this run.
ft_cols_qualitative = ['Stage_fear']
ft_cols_qualitative_to_one_hot = []

ft_cols_quantitative = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

(
    comparison_drain_df,
    cv_drain_scores_logistic,
    cv_drain_scores_rf,
    cv_drain_scores_xgbc,
) = run_exp(
    df_raw,
    ft_cols_qualitative,
    ft_cols_qualitative_to_one_hot,
    ft_cols_quantitative,
    target_col,
)
comparison_drain_df

Running Logistic Regression CV: 100%|██████████| 63/63 [00:14<00:00,  4.24it/s]


Best parameters LogClass: {'max_iter': 10, 'cs': 1, 'test_acc': np.float64(0.9882758620689653), 'cv_scores': {'fit_time': array([0.01651669, 0.01751685, 0.01651669, 0.01651669, 0.01751685]), 'score_time': array([0.01164699, 0.01264668, 0.01164699, 0.01164699, 0.01164675]), 'test_acc': array([0.9862069 , 0.9862069 , 0.98965517, 0.99310345, 0.9862069 ]), 'train_acc': array([0.9887931 , 0.9887931 , 0.98793103, 0.98706897, 0.9887931 ]), 'test_recall': array([1., 1., 1., 1., 1.]), 'train_recall': array([1., 1., 1., 1., 1.]), 'test_precision': array([0.97231834, 0.97231834, 0.97909408, 0.98601399, 0.97241379]), 'train_precision': array([0.97743056, 0.97743056, 0.97573657, 0.97402597, 0.97741095]), 'test_f1': array([0.98596491, 0.98596491, 0.98943662, 0.99295775, 0.98601399]), 'train_f1': array([0.98858648, 0.98858648, 0.9877193 , 0.98684211, 0.98857645]), 'test_roc_auc': array([0.98547947, 0.98801462, 0.99388234, 0.99122995, 0.98986744]), 'train_roc_auc': array([0.99082372, 0.99012268, 0.988

100%|██████████| 64/64 [00:17<00:00,  3.58it/s]


Best parameters: {'max_depth': 2, 'n_estimators': 50, 'min_samples_leaf': 3, 'test_acc': np.float64(0.9882758620689653), 'cv_scores': {'fit_time': array([0.07733774, 0.07298779, 0.07499528, 0.07499528, 0.07733774]), 'score_time': array([0.01801229, 0.02035022, 0.01834273, 0.02527118, 0.01701212]), 'test_acc': array([0.9862069 , 0.9862069 , 0.98965517, 0.99310345, 0.9862069 ]), 'train_acc': array([0.9887931 , 0.9887931 , 0.98793103, 0.98706897, 0.9887931 ]), 'test_recall': array([1., 1., 1., 1., 1.]), 'train_recall': array([1., 1., 1., 1., 1.]), 'test_precision': array([0.97231834, 0.97231834, 0.97909408, 0.98601399, 0.97241379]), 'train_precision': array([0.97743056, 0.97743056, 0.97573657, 0.97402597, 0.97741095]), 'test_f1': array([0.98596491, 0.98596491, 0.98943662, 0.99295775, 0.98601399]), 'train_f1': array([0.98858648, 0.98858648, 0.9877193 , 0.98684211, 0.98857645]), 'test_roc_auc': array([0.9833252 , 0.98586629, 0.98797891, 0.99288995, 0.97922319]), 'train_roc_auc': array([0.99

100%|██████████| 576/576 [00:46<00:00, 12.46it/s]

Best parameters XGB: {'max_depth': 6, 'n_estimators': 50, 'reg_lambda': 1, 'reg_aplha': 2, 'learning_rate': 1, 'test_acc': np.float64(0.9882758620689653), 'cv_scores': {'fit_time': array([0.02084589, 0.01866984, 0.02092552, 0.0165112 , 0.01994395]), 'score_time': array([0.01961279, 0.01437902, 0.01838422, 0.02061176, 0.01734519]), 'test_acc': array([0.9862069 , 0.9862069 , 0.98965517, 0.99310345, 0.9862069 ]), 'train_acc': array([0.9887931 , 0.9887931 , 0.98793103, 0.98706897, 0.9887931 ]), 'test_recall': array([1., 1., 1., 1., 1.]), 'train_recall': array([1., 1., 1., 1., 1.]), 'test_precision': array([0.97231834, 0.97231834, 0.97909408, 0.98601399, 0.97241379]), 'train_precision': array([0.97743056, 0.97743056, 0.97573657, 0.97402597, 0.97741095]), 'test_f1': array([0.98596491, 0.98596491, 0.98943662, 0.99295775, 0.98601399]), 'train_f1': array([0.98858648, 0.98858648, 0.9877193 , 0.98684211, 0.98857645]), 'test_roc_auc': array([0.98578298, 0.98920482, 0.98982373, 0.99211052, 0.987892




Unnamed: 0,Model,Mean Test Accuracy,Mean F1,Mean Recall,Mean Precision,Mean ROC AUC
0,Logistic Regression,0.988276,0.988068,1.0,0.976432,0.989695
1,Random Forest,0.988276,0.988068,1.0,0.976432,0.985857
2,XGBoost,0.988276,0.988068,1.0,0.976432,0.988963


In [None]:
# LaTeX table (mean / std / min / max across folds) for the logistic-regression
# run on the `Drained_after_socializing` target; underscores escaped by hand.
print(
    pd.DataFrame(cv_drain_scores_logistic)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.988276 & 0.988276 & 1.000000 & 0.976432 & 0.988068 & 0.989695 \\
std & 0.000771 & 0.003084 & 0.000000 & 0.006101 & 0.003116 & 0.003183 \\
min & 0.987069 & 0.986207 & 1.000000 & 0.972318 & 0.985965 & 0.985479 \\
max & 0.988793 & 0.993103 & 1.000000 & 0.986014 & 0.992958 & 0.993882 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX table (mean / std / min / max across folds) for the random-forest run
# on the `Drained_after_socializing` target; underscores escaped by hand.
print(
    pd.DataFrame(cv_drain_scores_rf)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.988276 & 0.988276 & 1.000000 & 0.976432 & 0.988068 & 0.985857 \\
std & 0.000771 & 0.003084 & 0.000000 & 0.006101 & 0.003116 & 0.005108 \\
min & 0.987069 & 0.986207 & 1.000000 & 0.972318 & 0.985965 & 0.979223 \\
max & 0.988793 & 0.993103 & 1.000000 & 0.986014 & 0.992958 & 0.992890 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX table (mean / std / min / max across folds) for the XGBoost run on the
# `Drained_after_socializing` target; underscores escaped by hand.
print(
    pd.DataFrame(cv_drain_scores_xgbc)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.988276 & 0.988276 & 1.000000 & 0.976432 & 0.988068 & 0.988963 \\
std & 0.000771 & 0.003084 & 0.000000 & 0.006101 & 0.003116 & 0.002343 \\
min & 0.987069 & 0.986207 & 1.000000 & 0.972318 & 0.985965 & 0.985783 \\
max & 0.988793 & 0.993103 & 1.000000 & 0.986014 & 0.992958 & 0.992111 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX version of the model-comparison table for the
# `Drained_after_socializing` target, with underscores escaped by hand.
print(
    comparison_drain_df
    .to_latex()
    .replace("_", "\\_")
)

\begin{tabular}{llrrrrr}
\toprule
 & Model & Mean Test Accuracy & Mean F1 & Mean Recall & Mean Precision & Mean ROC AUC \\
\midrule
0 & Logistic Regression & 0.988276 & 0.988068 & 1.000000 & 0.976432 & 0.989695 \\
1 & Random Forest & 0.988276 & 0.988068 & 1.000000 & 0.976432 & 0.985857 \\
2 & XGBoost & 0.988276 & 0.988068 & 1.000000 & 0.976432 & 0.988963 \\
\bottomrule
\end{tabular}



In [10]:

# Experiment: predict `Stage_fear` from the other features.
target_col = 'Stage_fear'

# The target is left out of the qualitative features; nothing is one-hot
# encoded in this run.
ft_cols_qualitative = ['Drained_after_socializing']
ft_cols_qualitative_to_one_hot = []

ft_cols_quantitative = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

(
    comparison_stage_df,
    cv_scores_stage_logistic,
    cv_scores_stage_rf,
    cv_scores_stage_xgbc,
) = run_exp(
    df_raw,
    ft_cols_qualitative,
    ft_cols_qualitative_to_one_hot,
    ft_cols_quantitative,
    target_col,
)
comparison_stage_df

Running Logistic Regression CV: 100%|██████████| 63/63 [00:13<00:00,  4.57it/s]


Best parameters LogClass: {'max_iter': 10, 'cs': 1, 'test_acc': np.float64(0.9893103448275863), 'cv_scores': {'fit_time': array([0.02945518, 0.02844834, 0.0304637 , 0.02952456, 0.02703142]), 'score_time': array([0.01437616, 0.01437616, 0.01437616, 0.01484656, 0.01437616]), 'test_acc': array([0.9862069 , 0.99310345, 0.99137931, 0.99137931, 0.98448276]), 'train_acc': array([0.99008621, 0.98836207, 0.9887931 , 0.9887931 , 0.99051724]), 'test_recall': array([1., 1., 1., 1., 1.]), 'train_recall': array([1., 1., 1., 1., 1.]), 'test_precision': array([0.97241379, 0.98601399, 0.9825784 , 0.9825784 , 0.96907216]), 'train_precision': array([0.98001738, 0.97662338, 0.97746967, 0.97746967, 0.98086957]), 'test_f1': array([0.98601399, 0.99295775, 0.99121265, 0.99121265, 0.98429319]), 'train_f1': array([0.98990785, 0.98817346, 0.98860649, 0.98860649, 0.99034241]), 'test_roc_auc': array([0.99127755, 0.99244966, 0.9930863 , 0.99200938, 0.9895283 ]), 'train_roc_auc': array([0.99178328, 0.99139171, 0.991

100%|██████████| 64/64 [00:18<00:00,  3.55it/s]


Best parameters: {'max_depth': 2, 'n_estimators': 50, 'min_samples_leaf': 3, 'test_acc': np.float64(0.9893103448275863), 'cv_scores': {'fit_time': array([0.07821512, 0.07922673, 0.07373714, 0.06973791, 0.07672   ]), 'score_time': array([0.02203035, 0.02103114, 0.02055907, 0.02055931, 0.02113152]), 'test_acc': array([0.9862069 , 0.99310345, 0.99137931, 0.99137931, 0.98448276]), 'train_acc': array([0.99008621, 0.98836207, 0.9887931 , 0.9887931 , 0.99051724]), 'test_recall': array([1., 1., 1., 1., 1.]), 'train_recall': array([1., 1., 1., 1., 1.]), 'test_precision': array([0.97241379, 0.98601399, 0.9825784 , 0.9825784 , 0.96907216]), 'train_precision': array([0.98001738, 0.97662338, 0.97746967, 0.97746967, 0.98086957]), 'test_f1': array([0.98601399, 0.99295775, 0.99121265, 0.99121265, 0.98429319]), 'train_f1': array([0.98990785, 0.98817346, 0.98860649, 0.98860649, 0.99034241]), 'test_roc_auc': array([0.99089676, 0.99342544, 0.9928721 , 0.99367533, 0.98819554]), 'train_roc_auc': array([0.99

100%|██████████| 576/576 [00:55<00:00, 10.40it/s]

Best parameters XGB: {'max_depth': 6, 'n_estimators': 50, 'reg_lambda': 0.5, 'reg_aplha': 2, 'learning_rate': 0.5, 'test_acc': np.float64(0.9886206896551725), 'cv_scores': {'fit_time': array([0.01735973, 0.02027273, 0.0143559 , 0.01926613, 0.02027464]), 'score_time': array([0.01390147, 0.02125001, 0.01489067, 0.01627731, 0.02124047]), 'test_acc': array([0.9862069 , 0.99310345, 0.99137931, 0.99137931, 0.98103448]), 'train_acc': array([0.99008621, 0.98836207, 0.9887931 , 0.9887931 , 0.99051724]), 'test_recall': array([1.       , 1.       , 1.       , 1.       , 0.9929078]), 'train_recall': array([1., 1., 1., 1., 1.]), 'test_precision': array([0.97241379, 0.98601399, 0.9825784 , 0.9825784 , 0.96885813]), 'train_precision': array([0.98001738, 0.97662338, 0.97746967, 0.97746967, 0.98086957]), 'test_f1': array([0.98601399, 0.99295775, 0.99121265, 0.99121265, 0.98073555]), 'train_f1': array([0.98990785, 0.98817346, 0.98860649, 0.98860649, 0.99034241]), 'test_roc_auc': array([0.98893926, 0.993




Unnamed: 0,Model,Mean Test Accuracy,Mean F1,Mean Recall,Mean Precision,Mean ROC AUC
0,Logistic Regression,0.98931,0.989138,1.0,0.978531,0.99167
1,Random Forest,0.98931,0.989138,1.0,0.978531,0.991813
2,XGBoost,0.988621,0.988427,0.998582,0.978489,0.989972


In [11]:
# LaTeX table (mean / std / min / max across folds) for the logistic-regression
# run on the `Stage_fear` target; underscores escaped by hand.
print(
    pd.DataFrame(cv_scores_stage_logistic)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.989310 & 0.989310 & 1.000000 & 0.978531 & 0.989138 & 0.991670 \\
std & 0.000934 & 0.003738 & 0.000000 & 0.007342 & 0.003756 & 0.001367 \\
min & 0.988362 & 0.984483 & 1.000000 & 0.969072 & 0.984293 & 0.989528 \\
max & 0.990517 & 0.993103 & 1.000000 & 0.986014 & 0.992958 & 0.993086 \\
\bottomrule
\end{tabular}



In [12]:
# LaTeX table (mean / std / min / max across folds) for the random-forest run
# on the `Stage_fear` target; underscores escaped by hand.
print(
    pd.DataFrame(cv_scores_stage_rf)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.989310 & 0.989310 & 1.000000 & 0.978531 & 0.989138 & 0.991813 \\
std & 0.000934 & 0.003738 & 0.000000 & 0.007342 & 0.003756 & 0.002298 \\
min & 0.988362 & 0.984483 & 1.000000 & 0.969072 & 0.984293 & 0.988196 \\
max & 0.990517 & 0.993103 & 1.000000 & 0.986014 & 0.992958 & 0.993675 \\
\bottomrule
\end{tabular}



In [13]:
# LaTeX table (mean / std / min / max across folds) for the XGBoost run on the
# `Stage_fear` target; underscores escaped by hand.
print(
    pd.DataFrame(cv_scores_stage_xgbc)
    .describe(percentiles=[])
    .loc[
        ["mean", "std", "min", "max"],
        ['train_acc', 'test_acc', 'test_recall', 'test_precision', 'test_f1', 'test_roc_auc'],
    ]
    .to_latex()
    .replace('_', '\\_')
)

\begin{tabular}{lrrrrrr}
\toprule
 & train\_acc & test\_acc & test\_recall & test\_precision & test\_f1 & test\_roc\_auc \\
\midrule
mean & 0.989310 & 0.988621 & 0.998582 & 0.978489 & 0.988427 & 0.989972 \\
std & 0.000934 & 0.004967 & 0.003172 & 0.007412 & 0.005026 & 0.002688 \\
min & 0.988362 & 0.981034 & 0.992908 & 0.968858 & 0.980736 & 0.986292 \\
max & 0.990517 & 0.993103 & 1.000000 & 0.986014 & 0.992958 & 0.993681 \\
\bottomrule
\end{tabular}



In [14]:
# LaTeX version of the model-comparison table for the `Stage_fear` target,
# with underscores escaped by hand.
print(
    comparison_stage_df
    .to_latex()
    .replace("_", "\\_")
)

\begin{tabular}{llrrrrr}
\toprule
 & Model & Mean Test Accuracy & Mean F1 & Mean Recall & Mean Precision & Mean ROC AUC \\
\midrule
0 & Logistic Regression & 0.989310 & 0.989138 & 1.000000 & 0.978531 & 0.991670 \\
1 & Random Forest & 0.989310 & 0.989138 & 1.000000 & 0.978531 & 0.991813 \\
2 & XGBoost & 0.988621 & 0.988427 & 0.998582 & 0.978489 & 0.989972 \\
\bottomrule
\end{tabular}

