# Setup and Imports

In [1]:
# Uncomment to install the boosting libraries into the kernel's environment:
# %pip install xgboost
# %pip install lightgbm
# %pip install catboost

In [2]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Later cells rely on this to show several results (shapes, columns, counts) at once.
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
# Relax pandas' display limits so wide tables are rendered without column truncation.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Seed passed as `random_state` to the stochastic estimators configured further down.
RS = 35577 # global random state seed
# Folder names used to build data file paths (relative to the notebook's working directory).
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [3]:
import time

from sklearn.preprocessing import StandardScaler, PowerTransformer

In [4]:
# Load the feature table X and the label series y from the processed-data folder.
# NOTE(review): t.from_pkl presumably unpickles the file — only load pickles from a trusted source.
(X, y) = t.from_pkl(f'{processed_data_path}/all_b1_b2.data.pkl')

# Each bare expression below is displayed because ast_node_interactivity is set to 'all'.
X.shape
X.columns

y.shape
y.value_counts()

(200, 54)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__p01', 'x__p10', 'x__p20', 'x__p30', 'x__p99', 'x__p90',
       'x__p80', 'x__p70', 'x__iqr', 'x__kurtosis', 'x__skew', 'y__amin',
       'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std', 'y__var',
       'y__p01', 'y__p10', 'y__p20', 'y__p30', 'y__p99', 'y__p90', 'y__p80',
       'y__p70', 'y__iqr', 'y__kurtosis', 'y__skew', 'z__amin', 'z__amax',
       'z__sum', 'z__median', 'z__mean', 'z__std', 'z__var', 'z__p01',
       'z__p10', 'z__p20', 'z__p30', 'z__p99', 'z__p90', 'z__p80', 'z__p70',
       'z__iqr', 'z__kurtosis', 'z__skew'],
      dtype='object')

(200,)

0    100
1    100
Name: status, dtype: int64

# CV

## Multiple Models

Reference — scikit-learn classifier comparison example: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, log_loss

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

# First Run

In [6]:
# Baseline sweep: cross-validate a set of untuned classifiers (tree ensembles are tried
# with several ensemble sizes) and display the running statistics after each model.
n_est_list = [100, 200, 400]
models_and_params = [
    # Seed the single tree as well, so every stochastic model in the sweep is repeatable.
    (DecisionTreeClassifier, {'random_state': RS}),
    (KNeighborsClassifier, {}),
    
    *[ (RandomForestClassifier, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBClassifier,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter: without random_state every run (and every Restart & Run All) draws
# different folds, so the reported means/stds could not be reproduced or compared.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# The return value is not needed here; results are shown through the on1Completed callback.
_ = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,DecisionTreeClassifier,,25,0.9,0.042,1,3.45,1.44,17,0.2
1,KNeighborsClassifier,,25,0.944,0.037,2,1.02,0.8,16,0.2
2,RandomForestClassifier,'n_estimators': 100,25,0.983,0.015,8,0.2,0.04,7,3.9
3,RandomForestClassifier,'n_estimators': 200,25,0.983,0.015,8,0.2,0.04,5,7.5
4,RandomForestClassifier,'n_estimators': 400,25,0.983,0.017,7,0.2,0.05,6,14.7
5,AdaBoostClassifier,'n_estimators': 100,25,0.977,0.028,6,0.42,0.03,13,4.4
6,AdaBoostClassifier,'n_estimators': 200,25,0.983,0.02,10,0.43,0.02,14,8.6
7,AdaBoostClassifier,'n_estimators': 400,25,0.984,0.019,11,0.44,0.03,15,17.2
8,LGBMClassifier,'n_estimators': 100,25,0.984,0.018,14,0.19,0.15,4,0.7
9,LGBMClassifier,'n_estimators': 200,25,0.984,0.02,12,0.27,0.24,11,1.1


# Shortlisted Best 3

In [12]:
# Second sweep: only the three shortlisted boosting models, with larger ensembles.
n_est_list = [800, 1600, 3200]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter so the folds (and therefore the scores and the fitted models whose
# feature importances are analysed afterwards) are reproducible across runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

# One list of fitted fold models per (model class, params) configuration.
all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.988,0.017,8,0.443,0.016,7,13.9
1,AdaBoostClassifier,'n_estimators': 1600,10,0.988,0.017,7,0.447,0.017,8,27.4
2,AdaBoostClassifier,'n_estimators': 3200,10,0.988,0.016,9,0.448,0.019,9,54.5
3,LGBMClassifier,'n_estimators': 800,10,0.986,0.016,4,0.25,0.198,5,0.9
4,LGBMClassifier,'n_estimators': 1600,10,0.986,0.016,4,0.25,0.198,5,1.3
5,LGBMClassifier,'n_estimators': 3200,10,0.986,0.016,4,0.25,0.198,5,2.0
6,CatBoostClassifier,'n_estimators': 800,10,0.986,0.014,2,0.158,0.066,3,37.1
7,CatBoostClassifier,'n_estimators': 1600,10,0.986,0.014,1,0.158,0.067,2,77.5
8,CatBoostClassifier,'n_estimators': 3200,10,0.986,0.013,6,0.158,0.07,1,151.3


In [31]:
def _model_short_name(model):
    """Return the upper-case letters of the model's class name (e.g. 'LGBMClassifier' -> 'LGBMC')."""
    return ''.join(ch for ch in model.__class__.__name__ if ch.isupper())


def _model_feature_names(model):
    """Return the fitted model's feature names, trying the known attribute spellings in order."""
    # Different libraries expose the training column names under different attributes;
    # the order below is the lookup order the notebook has always used.
    for attr in ('feature_names_in_', 'feature_name_', 'feature_names_'):
        if hasattr(model, attr):
            return getattr(model, attr)
    raise AttributeError(
        f'{model.__class__.__name__} exposes none of '
        'feature_names_in_, feature_name_, feature_names_'
    )


def get_fe_df(list_of_list_of_models, max_models=3, normalize=False):
    """Build a styled feature-importance comparison table from groups of fitted models.

    Parameters
    ----------
    list_of_list_of_models : sequence of sequences of fitted models
        One inner sequence per model configuration (e.g. the models fitted on each CV fold).
        Every model must provide ``feature_importances_`` and one of the feature-name
        attributes ``feature_names_in_``, ``feature_name_`` or ``feature_names_``.
    max_models : int or None, default 3
        How many models from the start of each group are used. ``None`` uses all of them.
    normalize : bool, default False
        When True, each model's importances are divided by their total before aggregation,
        so models reporting on different scales (fractions, split counts, percentages)
        contribute comparably to the overall ``sum``. The default keeps the raw values.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler over a frame indexed by feature name, sorted by the overall ``sum`` column
        (descending). For each group ``i`` it holds one column per model
        (``<SHORT>_<i>_<j>``), their row-wise ``<SHORT>_<i>_sum`` and ``<SHORT>_<i>_rank``.
        Rank columns are shaded with an orange gradient. The underlying frame is
        available through ``.data``.

    Raises
    ------
    ValueError
        If no groups are given or a group contributes no models.
    AttributeError
        If a model exposes none of the supported feature-name attributes.
    """
    groups = list(list_of_list_of_models)
    if not groups:
        raise ValueError('get_fe_df needs at least one group of fitted models')

    group_frames = []
    for i, models_to_analyse in enumerate(groups):
        selected = models_to_analyse[:max_models]
        if len(selected) == 0:
            raise ValueError(f'model group {i} contains no models to analyse')

        importance_columns = []
        for j, model in enumerate(selected):
            importances = pd.Series(
                index=_model_feature_names(model),
                data=model.feature_importances_,
                name=f'{_model_short_name(model)}_{i}_{j}',
            )
            if normalize:
                total = importances.sum()
                if total != 0:  # leave an all-zero vector untouched instead of producing NaN
                    importances = importances / total
            importance_columns.append(importances)

        # The group's sum/rank columns are named after the last model used in the group.
        group_prefix = _model_short_name(selected[-1])

        df_f_imp = pd.concat(importance_columns, axis=1)
        df_f_imp[f'{group_prefix}_{i}_sum'] = df_f_imp.sum(axis=1)
        # Default ranking is ascending, so the most important feature gets the highest rank;
        # astype(int) truncates the fractional ranks produced by ties.
        df_f_imp[f'{group_prefix}_{i}_rank'] = df_f_imp[f'{group_prefix}_{i}_sum'].rank().astype(int)
        group_frames.append(df_f_imp)

    # Combine once, after all groups are collected (not once per loop iteration).
    df_final_fe = pd.concat(group_frames, axis=1)

    sum_cols = [col for col in df_final_fe.columns if col.endswith('_sum')]
    df_final_fe['sum'] = df_final_fe[sum_cols].sum(axis=1)
    df_final_fe = df_final_fe.sort_values(by='sum', ascending=False)

    rank_cols = [col for col in df_final_fe.columns if col.endswith('_rank')]
    styler = df_final_fe.style
    # vmin below the smallest possible rank keeps even low-ranked cells lightly tinted.
    styler.background_gradient(subset=rank_cols, cmap=plt.cm.Oranges, vmin=-5)
    return styler

In [35]:
# NOTE: get_fe_df returns a pandas Styler (not a DataFrame), so `df_final_fe` is a styled view;
# later cells read the importance-sorted feature order through its `.index`.
# NOTE(review): `all_trained_models` is reassigned by later cells, so re-running this cell
# out of order would rank features from a different set of models.
df_final_fe = get_fe_df(all_trained_models)
df_final_fe

Unnamed: 0,ABC_0_0,ABC_0_1,ABC_0_2,ABC_0_sum,ABC_0_rank,ABC_1_0,ABC_1_1,ABC_1_2,ABC_1_sum,ABC_1_rank,ABC_2_0,ABC_2_1,ABC_2_2,ABC_2_sum,ABC_2_rank,LGBMC_3_0,LGBMC_3_1,LGBMC_3_2,LGBMC_3_sum,LGBMC_3_rank,LGBMC_4_0,LGBMC_4_1,LGBMC_4_2,LGBMC_4_sum,LGBMC_4_rank,LGBMC_5_0,LGBMC_5_1,LGBMC_5_2,LGBMC_5_sum,LGBMC_5_rank,CBC_6_0,CBC_6_1,CBC_6_2,CBC_6_sum,CBC_6_rank,CBC_7_0,CBC_7_1,CBC_7_2,CBC_7_sum,CBC_7_rank,CBC_8_0,CBC_8_1,CBC_8_2,CBC_8_sum,CBC_8_rank,sum
x__skew,0.065,0.0375,0.06625,0.16875,52,0.058125,0.0375,0.07,0.165625,52,0.059687,0.03625,0.061875,0.157812,52,139,73,83,295,54,139,73,83,295,54,139,73,83,295,54,9.425285,6.233313,4.769769,20.428367,53,9.615407,6.067858,4.744898,20.428163,53,9.729977,6.678263,4.87983,21.28807,53,947.636788
z__sum,0.0575,0.01875,0.02375,0.1,47,0.05375,0.020625,0.026875,0.10125,47,0.055,0.02375,0.028125,0.106875,48,107,77,74,258,53,107,77,74,258,53,107,77,74,258,53,5.442589,4.202057,3.08995,12.734596,51,5.254826,3.964798,2.957477,12.177101,51,5.776498,3.794311,3.030993,12.601803,51,811.821625
x__median,0.035,0.04375,0.05375,0.1325,50,0.03625,0.048125,0.05375,0.138125,51,0.03625,0.05125,0.051875,0.139375,51,38,57,90,185,50,38,57,90,185,50,38,57,90,185,50,6.389039,12.058615,18.135465,36.583119,54,6.817736,11.950881,17.523165,36.291783,54,6.215581,11.634412,17.604214,35.454208,54,663.739109
y__sum,0.0525,0.03125,0.0225,0.10625,48,0.048125,0.033125,0.023125,0.104375,48,0.051875,0.028125,0.02375,0.10375,47,112,45,48,205,52,112,45,48,205,52,112,45,48,205,52,3.283883,2.006472,1.0568,6.347155,39,3.255573,2.78334,1.07116,7.110073,42,3.323607,2.665259,0.958048,6.946914,42,635.718516
z__skew,0.07125,0.0725,0.0525,0.19625,54,0.06625,0.063125,0.056875,0.18625,54,0.065937,0.060937,0.0575,0.184375,54,67,67,56,190,51,67,67,56,190,51,67,67,56,190,51,2.428995,3.487223,1.801032,7.717249,45,2.483637,3.507539,2.021797,8.012973,44,2.423262,3.680597,2.034491,8.138349,44,594.435446
z__p99,0.0475,0.04,0.05125,0.13875,51,0.046875,0.029375,0.049375,0.125625,50,0.04875,0.03,0.046875,0.125625,49,65,57,61,183,49,65,57,61,183,49,65,57,61,183,49,3.650304,2.465609,2.426015,8.541927,46,3.56146,3.072479,2.663178,9.297117,47,3.387737,2.790519,2.884007,9.062262,47,576.291306
x__sum,0.035,0.05,0.025,0.11,49,0.038125,0.053125,0.0275,0.11875,49,0.039688,0.052812,0.033125,0.125625,49,36,56,49,141,48,36,56,49,141,48,36,56,49,141,48,2.497883,3.814402,5.237097,11.549382,49,2.248997,4.010943,5.545917,11.805857,49,2.409903,4.02111,4.912522,11.343535,48,458.053149
z__mean,0.0125,0.035,0.04625,0.09375,43,0.016875,0.034375,0.048125,0.099375,46,0.017188,0.031875,0.04875,0.097812,46,18,58,34,110,45,18,58,34,110,45,18,58,34,110,45,5.230476,3.948161,2.413355,11.591992,50,5.643379,3.671965,2.484491,11.799835,48,5.606917,4.193896,2.614456,12.415269,50,366.098033
y__p80,0.0425,0.0225,0.0175,0.0825,39,0.043125,0.01875,0.0175,0.079375,37,0.043437,0.020938,0.02,0.084375,39,55,25,31,111,46,55,25,31,111,46,55,25,31,111,46,2.091663,2.149503,3.460802,7.701968,44,2.164141,2.87522,3.311097,8.350459,45,2.27968,3.111871,3.256408,8.647959,45,357.946636
z__amax,0.0325,0.0275,0.01875,0.07875,38,0.030625,0.0325,0.0175,0.080625,38,0.039062,0.036562,0.0175,0.093125,45,33,27,41,101,44,33,27,41,101,44,33,27,41,101,44,5.440327,6.246596,4.908802,16.595726,52,5.487956,5.594688,4.393419,15.476063,52,5.495991,6.281721,4.214899,15.992611,52,351.3169


## Best 10 features

In [37]:
# Repeat the shortlisted sweep using only the 10 highest-ranked features.
n_est_list = [800, 1600, 3200]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter so this run uses reproducible folds and can be compared across runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# df_final_fe is sorted by total importance (descending); select the columns once
# instead of rebuilding the subset inside the lambda for every model.
# NOTE(review): the ranking was derived from models fitted on this same X/y, so the
# CV scores below are likely optimistic (feature selection is not nested in the CV).
top10_features = df_final_fe.index.to_list()[:10]
X_top10 = X[top10_features]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X_top10, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

# NOTE: this overwrites the models that the feature-importance table was built from.
all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.989,0.014,3,0.413,0.03,7,11.0
1,AdaBoostClassifier,'n_estimators': 1600,10,0.988,0.015,1,0.417,0.031,9,21.4
2,AdaBoostClassifier,'n_estimators': 3200,10,0.988,0.015,2,0.416,0.031,8,43.8
3,LGBMClassifier,'n_estimators': 800,10,0.99,0.012,4,0.191,0.178,4,1.1
4,LGBMClassifier,'n_estimators': 1600,10,0.991,0.011,5,0.191,0.175,5,2.4
5,LGBMClassifier,'n_estimators': 3200,10,0.991,0.011,6,0.192,0.173,6,3.8
6,CatBoostClassifier,'n_estimators': 800,10,0.991,0.011,7,0.119,0.054,2,29.5
7,CatBoostClassifier,'n_estimators': 1600,10,0.991,0.011,7,0.12,0.057,3,59.8
8,CatBoostClassifier,'n_estimators': 3200,10,0.992,0.011,9,0.117,0.058,1,117.5


In [38]:
## Best 20 features

In [40]:
# Largest ensembles only, evaluated on the 20 highest-ranked features.
n_est_list = [3200]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter so this run uses reproducible folds and can be compared across runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# df_final_fe is sorted by total importance (descending); select the columns once
# instead of rebuilding the subset inside the lambda for every model.
# NOTE(review): the ranking was derived from models fitted on this same X/y, so the
# CV scores below are likely optimistic (feature selection is not nested in the CV).
top20_features = df_final_fe.index.to_list()[:20]
X_top20 = X[top20_features]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X_top20, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 3200,10,0.991,0.013,2,0.419,0.022,3,45.7
1,LGBMClassifier,'n_estimators': 3200,10,0.989,0.014,1,0.229,0.178,2,2.2
2,CatBoostClassifier,'n_estimators': 3200,10,0.992,0.01,3,0.124,0.06,1,124.5


In [42]:
## Best 30 features

In [41]:
# Largest ensembles only, evaluated on the 30 highest-ranked features.
n_est_list = [3200]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter so this run uses reproducible folds and can be compared across runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# df_final_fe is sorted by total importance (descending); select the columns once
# instead of rebuilding the subset inside the lambda for every model.
# NOTE(review): the ranking was derived from models fitted on this same X/y, so the
# CV scores below are likely optimistic (feature selection is not nested in the CV).
top30_features = df_final_fe.index.to_list()[:30]
X_top30 = X[top30_features]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X_top30, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 3200,10,0.989,0.017,3,0.442,0.022,3,50.9
1,LGBMClassifier,'n_estimators': 3200,10,0.986,0.016,1,0.241,0.192,2,1.7
2,CatBoostClassifier,'n_estimators': 3200,10,0.988,0.013,2,0.142,0.067,1,140.5
