# Setup and Imports

In [None]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [1]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

RS = 35577 # global random state seed
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
import time

from sklearn.preprocessing import StandardScaler, PowerTransformer

In [3]:
(X, y) = t.from_pkl(f'{processed_data_path}/all_b1_b2_xyz_distance_cluster1.data.pkl')

X.shape
X.columns

y.shape
y.value_counts()

(200, 72)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__p01', 'x__p10', 'x__p20', 'x__p30', 'x__p99', 'x__p90',
       'x__p80', 'x__p70', 'x__iqr', 'x__kurtosis', 'x__skew', 'y__amin',
       'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std', 'y__var',
       'y__p01', 'y__p10', 'y__p20', 'y__p30', 'y__p99', 'y__p90', 'y__p80',
       'y__p70', 'y__iqr', 'y__kurtosis', 'y__skew', 'z__amin', 'z__amax',
       'z__sum', 'z__median', 'z__mean', 'z__std', 'z__var', 'z__p01',
       'z__p10', 'z__p20', 'z__p30', 'z__p99', 'z__p90', 'z__p80', 'z__p70',
       'z__iqr', 'z__kurtosis', 'z__skew', 'xyz__amin', 'xyz__amax',
       'xyz__sum', 'xyz__median', 'xyz__mean', 'xyz__std', 'xyz__var',
       'xyz__p01', 'xyz__p10', 'xyz__p20', 'xyz__p30', 'xyz__p99', 'xyz__p90',
       'xyz__p80', 'xyz__p70', 'xyz__iqr', 'xyz__kurtosis', 'xyz__skew'],
      dtype='object')

(200,)

0    100
1    100
Name: status, dtype: int64

# CV

## Multiple Models

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, log_loss

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

# First Run

In [6]:
n_est_list = [100, 200]
models_and_params = [
    (DecisionTreeClassifier, {}),
    (KNeighborsClassifier, {}),
    
    *[ (RandomForestClassifier, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBClassifier,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
metrics = [ roc_auc_score, log_loss ]

_ = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,DecisionTreeClassifier,,25,0.851,0.036,1,5.15,1.26,12,0.2
1,KNeighborsClassifier,,25,0.934,0.031,2,1.45,0.89,11,0.2
2,RandomForestClassifier,'n_estimators': 100,25,0.974,0.022,3,0.24,0.04,8,4.0
3,RandomForestClassifier,'n_estimators': 200,25,0.976,0.019,6,0.24,0.04,7,7.7
4,AdaBoostClassifier,'n_estimators': 100,25,0.986,0.017,10,0.44,0.02,9,5.0
5,AdaBoostClassifier,'n_estimators': 200,25,0.99,0.015,12,0.46,0.02,10,9.8
6,LGBMClassifier,'n_estimators': 100,25,0.986,0.02,9,0.18,0.15,1,0.8
7,LGBMClassifier,'n_estimators': 200,25,0.988,0.017,11,0.23,0.23,6,1.2
8,XGBClassifier,'n_estimators': 100,25,0.974,0.02,5,0.22,0.1,5,5.8
9,XGBClassifier,'n_estimators': 200,25,0.974,0.02,3,0.22,0.1,4,10.7


# Shortlisted Best 3

In [7]:
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.993,0.01,5,0.472,0.013,5,16.1
1,AdaBoostClassifier,'n_estimators': 1600,10,0.993,0.009,6,0.476,0.013,6,31.2
2,LGBMClassifier,'n_estimators': 800,10,0.99,0.012,3,0.186,0.142,4,1.0
3,LGBMClassifier,'n_estimators': 1600,10,0.99,0.012,3,0.186,0.142,3,1.1
4,CatBoostClassifier,'n_estimators': 800,10,0.987,0.011,2,0.165,0.05,2,39.5
5,CatBoostClassifier,'n_estimators': 1600,10,0.986,0.011,1,0.163,0.053,1,84.9


In [8]:
def get_fe_df(list_of_list_of_models):
    df_all_fe = []
    
    for i, models_to_analyse in enumerate(list_of_list_of_models):
        feature_imp_for_model = []

        for j, m in enumerate(models_to_analyse[:3]):
            model_short_name = ''.join([l for l in m.__class__.__name__ if l.isupper()])

            if hasattr(m, 'feature_names_in_'):
                feature_names = m.feature_names_in_
            elif hasattr(m, 'feature_name_'):
                feature_names = m.feature_name_
            else:
                feature_names = m.feature_names_

            feature_imp_for_model.append(
                pd.Series(index = feature_names, data = m.feature_importances_, name=f'{model_short_name}_{i}_{j}')
            )

        df_f_imp = pd.concat(feature_imp_for_model, axis=1)
        df_f_imp[f'{model_short_name}_{i}_sum'] = df_f_imp.sum(axis=1)
        df_f_imp[f'{model_short_name}_{i}_rank'] = df_f_imp[f'{model_short_name}_{i}_sum'].rank().astype(int)

        # df_f_imp.sort_values(by=[f'{model_short_name}_sum'], ascending = False)
        df_all_fe.append(df_f_imp)

        df_final_fe = pd.concat(df_all_fe, axis=1)
        
        sum_cols = [col for col in df_final_fe.columns if col.endswith('_sum')]
    df_final_fe['sum'] = df_final_fe[sum_cols].sum(axis=1)
    df_final_fe = df_final_fe.sort_values(by='sum', ascending=False)


    rank_cols = [col for col in df_final_fe.columns if col.endswith('_rank')]
    styler = df_final_fe.style
    styler.background_gradient(subset=rank_cols, cmap=plt.cm.Oranges, vmin=-5)
    return styler

In [9]:
df_final_fe = get_fe_df(all_trained_models)
df_final_fe

Unnamed: 0,ABC_0_0,ABC_0_1,ABC_0_2,ABC_0_sum,ABC_0_rank,ABC_1_0,ABC_1_1,ABC_1_2,ABC_1_sum,ABC_1_rank,LGBMC_2_0,LGBMC_2_1,LGBMC_2_2,LGBMC_2_sum,LGBMC_2_rank,LGBMC_3_0,LGBMC_3_1,LGBMC_3_2,LGBMC_3_sum,LGBMC_3_rank,CBC_4_0,CBC_4_1,CBC_4_2,CBC_4_sum,CBC_4_rank,CBC_5_0,CBC_5_1,CBC_5_2,CBC_5_sum,CBC_5_rank,sum
x__skew,0.09125,0.07375,0.0525,0.2175,71,0.085625,0.069375,0.05625,0.21125,71,129,72,68,269,72,129,72,68,269,72,10.925922,6.646765,6.789848,24.362535,71,11.078329,6.958247,7.05548,25.092056,71,587.883341
z__sum,0.0675,0.0275,0.035,0.13,66,0.066875,0.031875,0.033125,0.131875,67,91,70,66,227,70,91,70,66,227,70,6.360105,4.050483,5.388289,15.798876,70,6.173729,3.554208,5.414291,15.142229,70,485.20298
z__skew,0.07125,0.055,0.06625,0.1925,70,0.070625,0.058125,0.06625,0.195,70,90,71,70,231,71,90,71,70,231,71,2.950732,3.548929,2.889563,9.389223,64,2.942946,3.92497,2.818392,9.686308,65,481.463032
xyz__amin,0.0425,0.045,0.0475,0.135,67,0.040625,0.0475,0.041875,0.13,66,70,79,58,207,69,70,79,58,207,69,7.766297,10.507749,6.789162,25.063208,72,7.868606,10.795634,6.638542,25.302782,72,464.63099
y__sum,0.04375,0.02,0.0075,0.07125,58,0.041875,0.0175,0.01,0.069375,58,112,47,45,204,68,112,47,45,204,68,3.156274,1.573082,0.928839,5.658195,60,3.473214,2.142286,1.137943,6.753444,61,420.552264
x__sum,0.07375,0.1075,0.115,0.29625,72,0.0775,0.100625,0.11,0.288125,72,28,92,70,190,67,28,92,70,190,67,1.881497,4.60212,6.950978,13.434595,68,2.187057,5.429636,7.463734,15.080427,69,409.099398
x__kurtosis,0.0375,0.0175,0.02875,0.08375,62,0.035,0.01625,0.02875,0.08,61,62,31,78,171,66,62,31,78,171,66,5.879409,3.503727,4.636537,14.019672,69,5.834983,3.342349,4.366692,13.544024,68,369.727446
y__skew,0.03125,0.0775,0.04375,0.1525,69,0.031875,0.07,0.0475,0.149375,69,37,75,52,164,65,37,75,52,164,65,0.86504,2.751421,1.739222,5.355684,59,0.986864,2.538042,1.979387,5.504293,59,339.161852
z__p99,0.02375,0.03625,0.03875,0.09875,65,0.021875,0.04125,0.038125,0.10125,65,30,37,54,121,64,30,37,54,121,64,2.755975,2.234315,2.844342,7.834632,62,2.315313,2.725843,3.525199,8.566356,63,258.600988
x__p30,0.0075,0.00375,0.0075,0.01875,38,0.01,0.005625,0.01375,0.029375,43,29,54,28,111,63,29,54,28,111,63,2.011233,3.692486,3.205849,8.909569,63,1.772376,3.394726,3.174301,8.341403,62,239.299097


## Best 10 features

In [10]:
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X[df_final_fe.index.to_list()[:10]], y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.994,0.005,5,0.459,0.02,5,11.9
1,AdaBoostClassifier,'n_estimators': 1600,10,0.995,0.005,6,0.459,0.022,6,22.6
2,LGBMClassifier,'n_estimators': 800,10,0.99,0.009,3,0.191,0.124,4,0.8
3,LGBMClassifier,'n_estimators': 1600,10,0.99,0.009,3,0.191,0.123,3,1.3
4,CatBoostClassifier,'n_estimators': 800,10,0.989,0.011,1,0.123,0.043,2,27.2
5,CatBoostClassifier,'n_estimators': 1600,10,0.989,0.01,2,0.12,0.043,1,56.2


In [11]:
## Best 20 features

In [12]:
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X[df_final_fe.index.to_list()[:20]], y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.994,0.005,5,0.455,0.019,5,11.7
1,AdaBoostClassifier,'n_estimators': 1600,10,0.995,0.006,6,0.458,0.017,6,24.0
2,LGBMClassifier,'n_estimators': 800,10,0.992,0.009,3,0.14,0.122,3,0.8
3,LGBMClassifier,'n_estimators': 1600,10,0.992,0.009,3,0.14,0.122,4,1.3
4,CatBoostClassifier,'n_estimators': 800,10,0.992,0.008,1,0.123,0.038,2,30.8
5,CatBoostClassifier,'n_estimators': 1600,10,0.992,0.008,1,0.121,0.04,1,65.6


In [16]:
df_final_fe.index.to_list()[:30]

['x__skew',
 'z__sum',
 'z__skew',
 'xyz__amin',
 'y__sum',
 'x__sum',
 'x__kurtosis',
 'y__skew',
 'z__p99',
 'x__p30',
 'x__p70',
 'z__mean',
 'x__median',
 'xyz__kurtosis',
 'y__p70',
 'x__amax',
 'z__kurtosis',
 'z__amax',
 'x__mean',
 'y__mean',
 'z__p10',
 'y__amin',
 'z__p70',
 'y__p30',
 'y__median',
 'z__amin',
 'y__kurtosis',
 'x__amin',
 'y__p90',
 'z__median']