In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import itertools
from tqdm import tqdm

In [14]:
# Load the personality dataset from CSV into the frame every later cell works on.
# NOTE(review): the filename is spelled 'datasert' - presumably this matches the
# file on disk; confirm before "correcting" it.
df_raw = pd.read_csv('personality_datasert.csv')
# Bare expression: renders the loaded frame as this cell's output.
df_raw

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,3.0,Yes,2.0,0.0,Introvert


In [15]:
# Count nulls per column, then keep only the columns that actually contain any.
null_counts = df_raw.isna().sum()
missing_value = null_counts[null_counts > 0]
missing_value

Series([], dtype: int64)

In [16]:
# Encode the Yes/No features and the Introvert/Extrovert target as 0/1 integers.
# The lookup falls back to the existing value when it is not a key of the mapping,
# so re-running this cell on already-encoded data is a no-op (a plain
# `.map(dict)` would turn every 0/1 into NaN on the second run).
BINARY_ENCODINGS = {
    'Stage_fear': {'Yes': 1, 'No': 0},
    'Drained_after_socializing': {'Yes': 1, 'No': 0},
    'Personality': {'Introvert': 0, 'Extrovert': 1},
}

for _col, _codes in BINARY_ENCODINGS.items():
    df_raw[_col] = df_raw[_col].map(lambda value: _codes.get(value, value))
    # Fail loudly on labels outside the mapping instead of carrying them downstream.
    assert df_raw[_col].isin([0, 1]).all(), f"unexpected values in column {_col!r}"

In [17]:
# Spot-check the first two rows to confirm the Yes/No and personality columns are now 0/1.
df_raw.head(2)

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,0,4.0,6.0,0,13.0,5.0,1
1,9.0,1,0.0,0.0,1,0.0,3.0,0


In [18]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler

def run_kfold_pipeline(
        df_raw: pd.DataFrame,
        ft_cols_qualitative: list[str],
        ft_cols_qualitative_to_one_hot: list[str],
        ft_cols_quantitative: list[str],
        target_col: str,
        scaler,
        model,
        n_splits: int = 5,
        random_state: int = 42,
        shuffle: bool = True,
        scoring=None
    ):
    """Cross-validate ``model`` behind a preprocessing pipeline using stratified K-fold.

    Quantitative columns go through ``scaler``, the qualitative columns listed in
    ``ft_cols_qualitative_to_one_hot`` are one-hot encoded, and the remaining
    qualitative columns are passed through unchanged. Any column of ``df_raw`` not
    named in one of the three feature lists is dropped.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Frame holding both the feature columns and ``target_col``.
    ft_cols_qualitative : list[str]
        All qualitative feature columns.
    ft_cols_qualitative_to_one_hot : list[str]
        Subset of ``ft_cols_qualitative`` to one-hot encode.
    ft_cols_quantitative : list[str]
        Numeric feature columns handed to ``scaler``.
    target_col : str
        Name of the label column.
    scaler
        Transformer applied to the quantitative columns.
    model
        Estimator placed at the end of the pipeline.
    n_splits : int
        Number of stratified folds.
    random_state : int
        Seed for fold shuffling; ignored when ``shuffle`` is False.
    shuffle : bool
        Whether to shuffle samples before splitting into folds.
    scoring
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    dict
        The ``cross_validate`` result (fit/score times plus train and test
        scores, one array entry per fold).
    """
    # Keep the caller's column order (and drop duplicates) instead of going through
    # a set difference, whose iteration order is not stable between interpreter runs
    # and would reshuffle the feature matrix columns.
    one_hot_cols = set(ft_cols_qualitative_to_one_hot)
    binary_flag_cols = list(dict.fromkeys(
        col for col in ft_cols_qualitative if col not in one_hot_cols
    ))

    preprocessor = ColumnTransformer(
        transformers=[
            ("num",    scaler,                                   ft_cols_quantitative),
            ("onehot", OneHotEncoder(handle_unknown="ignore"),   ft_cols_qualitative_to_one_hot),
            ("binary", "passthrough",                            binary_flag_cols)
        ],
        remainder="drop"
    )

    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    X = df_raw.drop(columns=[target_col])
    y = df_raw[target_col]

    # StratifiedKFold raises ValueError when a random_state is supplied together
    # with shuffle=False, so only forward the seed when it is actually used.
    cv = StratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    results = cross_validate(pipe, X, y, cv=cv, n_jobs=-1,
                             return_train_score=True, scoring=scoring)

    return results



In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics recorded for every cross-validation run; "acc" is the one used to pick the winner.
_EXP_SCORING = {
    "acc": "accuracy",
    "recall": "recall",
    "precision": "precision",
    "f1": "f1",
    "roc_auc": "roc_auc"
}


def _summarize_cv_scores(cv_scores):
    """Return the mean/std/min/max rows describing a ``cross_validate`` result dict."""
    return pd.DataFrame(cv_scores).describe(percentiles=[]).loc[["mean", "std", "min", "max"]]


def _grid_search_cv(build_model, param_grid, pipeline_kwargs, desc):
    """Exhaustively evaluate ``param_grid`` and return the best combination.

    ``build_model`` is called with one keyword argument per key of ``param_grid``
    and must return a fresh estimator. Combinations are visited in
    ``itertools.product`` order and a later combination only replaces the
    incumbent when its mean test accuracy is strictly higher, so ties keep the
    first combination seen.

    Returns a dict with one entry per grid key plus ``"test_acc"`` (best mean
    test accuracy) and ``"cv_scores"`` (the full result of that run).
    """
    names = list(param_grid)
    combos = list(itertools.product(*(param_grid[name] for name in names)))

    best = {name: None for name in names}
    best["test_acc"] = -math.inf
    best["cv_scores"] = None

    for values in tqdm(combos, total=len(combos), desc=desc):
        params = dict(zip(names, values))
        cv_run = run_kfold_pipeline(
            scaler=StandardScaler(),
            model=build_model(**params),
            scoring=dict(_EXP_SCORING),
            **pipeline_kwargs
        )

        mean_test_acc = cv_run["test_acc"].mean()
        if mean_test_acc > best["test_acc"]:
            best.update(params)
            best["test_acc"] = mean_test_acc
            best["cv_scores"] = cv_run

    return best


def run_exp(ft_cols_qualitative, ft_cols_qualitative_to_one_hot, ft_cols_quantitative, target_col,
            random_state=42):
    """Grid-search three classifiers with stratified K-fold CV and print the results.

    Runs, in order, a LogisticRegressionCV grid, a RandomForestClassifier grid and
    an XGBClassifier grid. For each model family the best hyper-parameters
    (selected on mean test accuracy) and a mean/std/min/max summary of the winning
    run's scores are printed.

    The data is read from the notebook-level ``df_raw`` frame.

    Parameters
    ----------
    ft_cols_qualitative, ft_cols_qualitative_to_one_hot, ft_cols_quantitative
        Feature column lists, forwarded to ``run_kfold_pipeline``.
    target_col
        Name of the label column in ``df_raw``.
    random_state
        Seed given to the random forest and XGBoost models so repeated runs of
        the search produce the same scores.

    Returns
    -------
    None
    """
    pipeline_kwargs = dict(
        df_raw=df_raw,
        ft_cols_qualitative=ft_cols_qualitative,
        ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
        ft_cols_quantitative=ft_cols_quantitative,
        target_col=target_col,
    )

    # --- Logistic regression -------------------------------------------------
    best_logistic = _grid_search_cv(
        build_model=lambda max_iter, cs: LogisticRegressionCV(max_iter=max_iter, Cs=cs),
        param_grid={
            "max_iter": [10, 20, 30, 40, 50, 100, 200, 300, 500],
            "cs": [1, 2, 3, 4, 5, 8, 10],
        },
        pipeline_kwargs=pipeline_kwargs,
        desc="Running Logistic Regression CV",
    )
    print(f"Best parameters LogClass: {best_logistic}")
    print(_summarize_cv_scores(best_logistic["cv_scores"]))

    # --- Random forest -------------------------------------------------------
    best_rf = _grid_search_cv(
        build_model=lambda **params: RandomForestClassifier(random_state=random_state, **params),
        param_grid={
            "max_depth": [2, 3, 5, 10],
            "n_estimators": [50, 100, 150, 200],
            "min_samples_leaf": [3, 5, 10, 20],
        },
        pipeline_kwargs=pipeline_kwargs,
        desc="Running Random Forest CV",
    )
    print(f"Best parameters: {best_rf}")
    # A bare expression inside a function is discarded, so the summary must be printed.
    print(_summarize_cv_scores(best_rf["cv_scores"]))

    # --- XGBoost -------------------------------------------------------------
    # Grid keys are listed in the order the combinations should be iterated.
    best_xgb = _grid_search_cv(
        build_model=lambda **params: XGBClassifier(random_state=random_state, **params),
        param_grid={
            "max_depth": [6, 7, 8, 9],
            "n_estimators": [50, 100, 150, 200],
            "reg_lambda": [0.5, 1, 2],
            "learning_rate": [1, 0.8, 0.5, 0.25],
            "reg_alpha": [0.5, 1, 2],
        },
        pipeline_kwargs=pipeline_kwargs,
        desc="Running XGBoost CV",
    )
    print(f"Best parameters XGB: {best_xgb}")
    print(_summarize_cv_scores(best_xgb["cv_scores"]))

In [20]:

# Qualitative features (encoded as 0/1 flags earlier in the notebook).
ft_cols_qualitative = ['Stage_fear', 'Drained_after_socializing']

# Qualitative features that need one-hot encoding: none for this dataset.
ft_cols_qualitative_to_one_hot = []

# Numeric features, scaled inside the pipeline.
ft_cols_quantitative = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

# Label column to predict.
target_col = 'Personality'

run_exp(
    ft_cols_qualitative=ft_cols_qualitative,
    ft_cols_qualitative_to_one_hot=ft_cols_qualitative_to_one_hot,
    ft_cols_quantitative=ft_cols_quantitative,
    target_col=target_col,
)

Running Logistic Regression CV: 100%|██████████| 63/63 [00:13<00:00,  4.82it/s]


Best parameters LogClass: {'max_iter': 10, 'cs': 1, 'test_acc': np.float64(0.9344827586206896), 'cv_scores': {'fit_time': array([0.01874685, 0.02794361, 0.02845097, 0.01874685, 0.02738857]), 'score_time': array([0.01403737, 0.01403737, 0.01303887, 0.01403737, 0.01303887]), 'test_acc': array([0.9362069 , 0.92241379, 0.9362069 , 0.92586207, 0.95172414]), 'train_acc': array([0.93405172, 0.9375    , 0.93405172, 0.93663793, 0.93017241]), 'test_recall': array([0.92307692, 0.90939597, 0.91275168, 0.9261745 , 0.95637584]), 'train_recall': array([0.9261745 , 0.92958927, 0.92875105, 0.92539816, 0.91785415]), 'test_precision': array([0.95172414, 0.93771626, 0.96113074, 0.92929293, 0.95      ]), 'train_precision': array([0.94439692, 0.94786325, 0.94217687, 0.95008606, 0.94477998]), 'test_f1': array([0.93718166, 0.92333901, 0.9363167 , 0.92773109, 0.95317726]), 'train_f1': array([0.93519695, 0.93863733, 0.93541579, 0.93757962, 0.93112245]), 'test_roc_auc': array([0.91422773, 0.88958899, 0.91702366,

100%|██████████| 64/64 [00:18<00:00,  3.48it/s]


Best parameters: {'max_depth': 2, 'n_estimators': 50, 'min_samples_leaf': 3, 'test_acc': np.float64(0.9344827586206896), 'cv_scores': {'fit_time': array([0.06159568, 0.0585742 , 0.06008387, 0.06159568, 0.0585742 ]), 'score_time': array([0.01642346, 0.0244987 , 0.01793528, 0.02498651, 0.01893544]), 'test_acc': array([0.9362069 , 0.92241379, 0.9362069 , 0.92586207, 0.95172414]), 'train_acc': array([0.93405172, 0.9375    , 0.93405172, 0.93663793, 0.93017241]), 'test_recall': array([0.92307692, 0.90939597, 0.91275168, 0.9261745 , 0.95637584]), 'train_recall': array([0.9261745 , 0.92958927, 0.92875105, 0.92539816, 0.91785415]), 'test_precision': array([0.95172414, 0.93771626, 0.96113074, 0.92929293, 0.95      ]), 'train_precision': array([0.94439692, 0.94786325, 0.94217687, 0.95008606, 0.94477998]), 'test_f1': array([0.93718166, 0.92333901, 0.9363167 , 0.92773109, 0.95317726]), 'train_f1': array([0.93519695, 0.93863733, 0.93541579, 0.93757962, 0.93112245]), 'test_roc_auc': array([0.9568431 

  0%|          | 0/576 [00:00<?, ?it/s]


KeyError: "['score_label_as_int'] not found in axis"