# Setup and Imports

In [None]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [1]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Seed handed to the model configurations in later cells via their `random_state` parameter.
RS = 35577 # global random state seed
# Directory names (relative paths) used to build data file locations;
# `processed_data_path` is the one used when the pickled dataset is loaded.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [2]:
import time

from sklearn.preprocessing import StandardScaler, PowerTransformer

In [3]:
# Load the pre-computed feature matrix X and target y.
# NOTE(review): `t.from_pkl` presumably unpickles the file - only load pickles from a
# trusted source, since unpickling can execute arbitrary code.
X, y = t.from_pkl(f'{processed_data_path}/all_b1_b2_xyz_distance.data.pkl')

# Fail fast if features and target are misaligned; every CV cell below assumes
# one target value per feature row.
assert len(X) == len(y), f'X has {len(X)} rows but y has {len(y)} values'

# Quick look at the feature matrix (each bare expression is displayed because
# ast_node_interactivity is set to 'all' in the setup cell).
X.shape
X.columns

# ... and at the target, including class balance.
y.shape
y.value_counts()

(200, 72)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__p01', 'x__p10', 'x__p20', 'x__p30', 'x__p99', 'x__p90',
       'x__p80', 'x__p70', 'x__iqr', 'x__kurtosis', 'x__skew', 'y__amin',
       'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std', 'y__var',
       'y__p01', 'y__p10', 'y__p20', 'y__p30', 'y__p99', 'y__p90', 'y__p80',
       'y__p70', 'y__iqr', 'y__kurtosis', 'y__skew', 'z__amin', 'z__amax',
       'z__sum', 'z__median', 'z__mean', 'z__std', 'z__var', 'z__p01',
       'z__p10', 'z__p20', 'z__p30', 'z__p99', 'z__p90', 'z__p80', 'z__p70',
       'z__iqr', 'z__kurtosis', 'z__skew', 'xyz__amin', 'xyz__amax',
       'xyz__sum', 'xyz__median', 'xyz__mean', 'xyz__std', 'xyz__var',
       'xyz__p01', 'xyz__p10', 'xyz__p20', 'xyz__p30', 'xyz__p99', 'xyz__p90',
       'xyz__p80', 'xyz__p70', 'xyz__iqr', 'xyz__kurtosis', 'xyz__skew'],
      dtype='object')

(200,)

0    100
1    100
Name: status, dtype: int64

# CV

## Multiple Models

Reference: scikit-learn's classifier comparison example — https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, log_loss

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

# First Run

In [5]:
# Broad first pass: a few simple baselines plus several ensemble families, each
# ensemble tried with a small range of n_estimators.
n_est_list = [100, 200, 400]
models_and_params = [
    # Seeded like the ensembles below so that reruns build the same tree.
    (DecisionTreeClassifier, {'random_state': RS}),
    (KNeighborsClassifier, {}),
    
    *[ (RandomForestClassifier, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBClassifier,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter too: without random_state the fold assignment differs on every
# run, so the reported means/stds would not be reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Evaluate every (model class, params) pair with the shared CV scheme; the running
# stats table is refreshed after each configuration completes.
_ = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,DecisionTreeClassifier,,25,0.965,0.032,2,1.21,1.09,17,0.2
1,KNeighborsClassifier,,25,0.942,0.037,1,1.18,0.94,16,0.2
2,RandomForestClassifier,'n_estimators': 100,25,0.985,0.013,5,0.2,0.04,12,4.0
3,RandomForestClassifier,'n_estimators': 200,25,0.985,0.012,4,0.2,0.04,11,8.5
4,RandomForestClassifier,'n_estimators': 400,25,0.985,0.013,3,0.19,0.04,10,16.4
5,AdaBoostClassifier,'n_estimators': 100,25,0.99,0.015,6,0.34,0.06,13,5.1
6,AdaBoostClassifier,'n_estimators': 200,25,0.992,0.012,10,0.36,0.06,14,9.6
7,AdaBoostClassifier,'n_estimators': 400,25,0.993,0.01,16,0.36,0.08,15,19.4
8,LGBMClassifier,'n_estimators': 100,25,0.992,0.012,11,0.14,0.12,7,0.8
9,LGBMClassifier,'n_estimators': 200,25,0.992,0.01,14,0.18,0.17,8,1.2


# Shortlisted Best 3

In [6]:
# Second pass: only the three shortlisted families, with larger ensembles.
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seed the splitter so fold assignment (and therefore the scores and the fitted
# models collected below) is reproducible across runs.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

# One list of fitted models per configuration; consumed by the feature-importance
# analysis in the following cells.
all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.996,0.008,5,0.34,0.098,5,15.2
1,AdaBoostClassifier,'n_estimators': 1600,10,0.996,0.005,6,0.343,0.102,6,30.4
2,LGBMClassifier,'n_estimators': 800,10,0.994,0.011,1,0.148,0.157,3,0.8
3,LGBMClassifier,'n_estimators': 1600,10,0.994,0.011,1,0.148,0.157,3,1.1
4,CatBoostClassifier,'n_estimators': 800,10,0.994,0.007,3,0.109,0.045,2,38.5
5,CatBoostClassifier,'n_estimators': 1600,10,0.994,0.007,3,0.107,0.046,1,83.9


In [7]:
def _model_short_name(model):
    """Return the upper-case letters of the model's class name, e.g. 'LGBMClassifier' -> 'LGBMC'."""
    return ''.join(ch for ch in model.__class__.__name__ if ch.isupper())


def _model_feature_names(model):
    """Return the fitted model's feature names, trying the known attribute spellings in order."""
    for attr in ('feature_names_in_', 'feature_name_'):
        if hasattr(model, attr):
            return getattr(model, attr)
    # Last spelling tried; an AttributeError here means none of the three exist.
    return model.feature_names_


def get_fe_df(list_of_list_of_models, n_models=3, normalize=False):
    """Build a styled table of feature importances across groups of fitted models.

    For each group (one entry of ``list_of_list_of_models``) the first ``n_models``
    models contribute one column of ``feature_importances_``. Each group also gets a
    ``*_sum`` column (row-wise sum of its model columns) and a ``*_rank`` column
    (ascending rank of that sum, so the largest sum gets the largest rank). A final
    ``sum`` column adds up all ``*_sum`` columns and the rows are sorted by it,
    largest first.

    Parameters
    ----------
    list_of_list_of_models : iterable of sequences of fitted models
        Each model must expose ``feature_importances_`` and one of
        ``feature_names_in_``, ``feature_name_`` or ``feature_names_``.
    n_models : int, default 3
        How many models to take from the start of each group.
    normalize : bool, default False
        When True, each model's importances are divided by their total before being
        combined, so that model types reporting importances on different scales
        contribute comparably to the overall ``sum``. The default keeps the raw values.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler over the combined table, with an orange gradient on the rank columns.
        The feature names are available via its ``.index``.

    Raises
    ------
    ValueError
        If no groups are given, or if a group contributes no models.
    """
    groups = list(list_of_list_of_models)
    if not groups:
        raise ValueError('get_fe_df needs at least one group of fitted models')

    per_group_frames = []
    for i, models_to_analyse in enumerate(groups):
        selected = models_to_analyse[:n_models]
        if len(selected) == 0:
            raise ValueError(f'model group {i} contains no models')

        importance_columns = []
        for j, model in enumerate(selected):
            model_short_name = _model_short_name(model)
            importances = pd.Series(
                index=_model_feature_names(model),
                data=model.feature_importances_,
                name=f'{model_short_name}_{i}_{j}',
            )
            if normalize:
                total = importances.sum()
                # Leave an all-zero importance vector untouched rather than divide by zero.
                if total != 0:
                    importances = importances / total
            importance_columns.append(importances)

        df_f_imp = pd.concat(importance_columns, axis=1)
        # Group-level columns are labelled with the short name of the last model in the group.
        sum_col = f'{model_short_name}_{i}_sum'
        df_f_imp[sum_col] = df_f_imp.sum(axis=1)
        df_f_imp[f'{model_short_name}_{i}_rank'] = df_f_imp[sum_col].rank().astype(int)
        per_group_frames.append(df_f_imp)

    # Combine once, after all groups are built (previously redone on every iteration).
    df_final_fe = pd.concat(per_group_frames, axis=1)

    sum_cols = [col for col in df_final_fe.columns if col.endswith('_sum')]
    df_final_fe['sum'] = df_final_fe[sum_cols].sum(axis=1)
    df_final_fe = df_final_fe.sort_values(by='sum', ascending=False)

    rank_cols = [col for col in df_final_fe.columns if col.endswith('_rank')]
    styler = df_final_fe.style
    styler.background_gradient(subset=rank_cols, cmap=plt.cm.Oranges, vmin=-5)
    return styler

In [8]:
# get_fe_df returns a pandas Styler (the `.style` of the ranked importance table), not a
# DataFrame, despite the `df_` prefix. Later cells read `df_final_fe.index` to pick the
# top-N features, so the name is kept as is.
df_final_fe = get_fe_df(all_trained_models)
df_final_fe

Unnamed: 0,ABC_0_0,ABC_0_1,ABC_0_2,ABC_0_sum,ABC_0_rank,ABC_1_0,ABC_1_1,ABC_1_2,ABC_1_sum,ABC_1_rank,LGBMC_2_0,LGBMC_2_1,LGBMC_2_2,LGBMC_2_sum,LGBMC_2_rank,LGBMC_3_0,LGBMC_3_1,LGBMC_3_2,LGBMC_3_sum,LGBMC_3_rank,CBC_4_0,CBC_4_1,CBC_4_2,CBC_4_sum,CBC_4_rank,CBC_5_0,CBC_5_1,CBC_5_2,CBC_5_sum,CBC_5_rank,sum
xyz__amin,0.4525,0.095,0.0925,0.64,72,0.52625,0.1,0.081875,0.708125,72,157,122,109,388,72,157,122,109,388,72,39.974521,40.441696,33.740916,114.157133,72,42.381247,40.509898,33.448724,116.339869,72,1007.845127
y__sum,0.04375,0.02375,0.0275,0.095,66,0.05125,0.0275,0.02375,0.1025,66,97,52,42,191,71,97,52,42,191,71,3.771292,4.380575,1.605316,9.757183,70,4.671554,4.343752,1.761265,10.776572,70,402.731255
x__skew,0.0125,0.0325,0.03625,0.08125,64,0.00625,0.02875,0.041875,0.076875,64,59,81,45,185,70,59,81,45,185,70,3.755875,3.129409,2.475877,9.361161,69,3.59899,3.329452,2.803732,9.732175,69,389.251461
z__sum,0.0125,0.00625,0.03375,0.0525,58,0.00625,0.005,0.03,0.04125,54,107,33,41,181,69,107,33,41,181,69,3.424664,2.638825,1.926023,7.989512,67,2.985079,2.121427,1.972919,7.079425,66,377.162687
x__median,0.01125,0.0525,0.06375,0.1275,70,0.005625,0.053125,0.06375,0.1225,69,13,47,81,141,68,13,47,81,141,68,4.126702,5.049573,8.540702,17.716977,71,3.803265,5.33004,8.546567,17.679873,71,317.646849
z__p99,0.07875,0.01375,0.01875,0.11125,67,0.093125,0.011875,0.0175,0.1225,69,69,26,38,133,67,69,26,38,133,67,1.216849,0.916216,1.214445,3.34751,59,1.185553,1.071518,1.435967,3.693038,60,273.274298
z__skew,0.01875,0.05,0.055,0.12375,69,0.009375,0.046875,0.05125,0.1075,67,25,47,59,131,66,25,47,59,131,66,1.341391,1.620625,1.32615,4.288166,61,0.83135,1.769021,1.349434,3.949805,61,270.469222
xyz__p01,0.00125,0.0025,0.005,0.00875,15,0.000625,0.003125,0.004375,0.008125,15,70,14,41,125,65,70,14,41,125,65,0.863758,0.843966,0.874388,2.582113,51,0.644223,0.659393,0.903401,2.207017,47,254.806005
y__skew,0.01,0.055,0.05,0.115,68,0.005,0.05625,0.04875,0.11,68,11,41,46,98,64,11,41,46,98,64,0.42575,0.635427,0.679248,1.740426,36,0.391481,0.688404,0.564911,1.644796,36,199.610221
x__sum,0.0075,0.02875,0.03,0.06625,62,0.00375,0.025,0.030625,0.059375,61,9,42,34,85,63,9,42,34,85,63,1.504104,1.672305,2.928703,6.105111,64,1.064043,1.771486,2.79441,5.629939,64,181.860676


## Best 10 features

In [9]:
# Re-run the shortlisted models using only the top-ranked features.
n_top_features = 10

n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seeded so the fold assignment, and hence the scores, are reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# df_final_fe is sorted by summed importance (largest first), so the first entries of
# its index are the top features. The subset is built once instead of per model.
# NOTE(review): the ranking was presumably derived from models fitted on folds of this
# same X, y, so CV scores on the selected subset may be optimistic - confirm on held-out data.
top_features = df_final_fe.index.to_list()[:n_top_features]
X_top = X[top_features]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X_top, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.999,0.003,6,0.328,0.109,5,11.1
1,AdaBoostClassifier,'n_estimators': 1600,10,0.998,0.003,5,0.334,0.112,6,21.6
2,LGBMClassifier,'n_estimators': 800,10,0.998,0.002,3,0.08,0.098,3,1.1
3,LGBMClassifier,'n_estimators': 1600,10,0.998,0.002,3,0.081,0.098,4,1.5
4,CatBoostClassifier,'n_estimators': 800,10,0.996,0.006,1,0.07,0.038,2,27.2
5,CatBoostClassifier,'n_estimators': 1600,10,0.996,0.006,1,0.069,0.038,1,57.4


In [None]:
## Best 20 features

In [10]:
# Re-run the shortlisted models using only the top-ranked features.
n_top_features = 20

n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seeded so the fold assignment, and hence the scores, are reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# df_final_fe is sorted by summed importance (largest first), so the first entries of
# its index are the top features. The subset is built once instead of per model.
# NOTE(review): the ranking was presumably derived from models fitted on folds of this
# same X, y, so CV scores on the selected subset may be optimistic - confirm on held-out data.
top_features = df_final_fe.index.to_list()[:n_top_features]
X_top = X[top_features]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X_top, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.996,0.007,5,0.311,0.092,5,11.7
1,AdaBoostClassifier,'n_estimators': 1600,10,0.997,0.007,6,0.314,0.094,6,22.7
2,LGBMClassifier,'n_estimators': 800,10,0.996,0.006,3,0.117,0.139,3,0.8
3,LGBMClassifier,'n_estimators': 1600,10,0.996,0.006,1,0.117,0.14,4,1.4
4,CatBoostClassifier,'n_estimators': 800,10,0.996,0.006,3,0.079,0.037,2,30.2
5,CatBoostClassifier,'n_estimators': 1600,10,0.996,0.006,3,0.078,0.038,1,69.2


In [None]:
## Best 30 features

In [11]:
# Re-run the shortlisted models using only the top-ranked features.
n_top_features = 30

n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
# Seeded so the fold assignment, and hence the scores, are reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# df_final_fe is sorted by summed importance (largest first), so the first entries of
# its index are the top features. The subset is built once instead of per model.
# NOTE(review): the ranking was presumably derived from models fitted on folds of this
# same X, y, so CV scores on the selected subset may be optimistic - confirm on held-out data.
top_features = df_final_fe.index.to_list()[:n_top_features]
X_top = X[top_features]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X_top, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(r)
)

all_trained_models = [res['models'] for res in r]

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,10,0.996,0.007,6,0.314,0.092,5,15.4
1,AdaBoostClassifier,'n_estimators': 1600,10,0.996,0.007,5,0.315,0.094,6,25.4
2,LGBMClassifier,'n_estimators': 800,10,0.996,0.006,3,0.112,0.134,3,0.7
3,LGBMClassifier,'n_estimators': 1600,10,0.996,0.006,3,0.112,0.134,3,1.1
4,CatBoostClassifier,'n_estimators': 800,10,0.995,0.007,1,0.088,0.041,2,35.2
5,CatBoostClassifier,'n_estimators': 1600,10,0.995,0.007,1,0.087,0.041,1,74.0
