# Setup and Imports

In [1]:
# Optional one-time setup: uncomment to install the boosting libraries imported further down.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install xgboost
# %pip install lightgbm
# %pip install catboost

In [2]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Several cells below rely on this to show more than one result.
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
# pandas display settings: no limit on the number of columns shown, long cell
# text (up to 999 characters) is not truncated, and up to 113 rows are printed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide constants. The two paths are relative, so they resolve against
# the working directory of the kernel.
RS = 35577 # global random state seed
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [3]:
import time

from sklearn.preprocessing import StandardScaler, PowerTransformer

In [4]:
# Load the pre-computed feature matrix X and target y from the processed-data folder.
# NOTE(review): from_pkl presumably unpickles the file -- only load files this project produced.
X, y = t.from_pkl(f'{processed_data_path}/xyz_distance_cluster_rpm.data.pkl')

# Feature matrix: dimensions first, then the column names.
X.shape
X.columns

# Target: length first, then the number of samples per class.
y.shape
y.value_counts()

(200, 72)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__p01', 'x__p10', 'x__p20', 'x__p30', 'x__p99', 'x__p90',
       'x__p80', 'x__p70', 'x__iqr', 'x__kurtosis', 'x__skew', 'y__amin',
       'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std', 'y__var',
       'y__p01', 'y__p10', 'y__p20', 'y__p30', 'y__p99', 'y__p90', 'y__p80',
       'y__p70', 'y__iqr', 'y__kurtosis', 'y__skew', 'z__amin', 'z__amax',
       'z__sum', 'z__median', 'z__mean', 'z__std', 'z__var', 'z__p01',
       'z__p10', 'z__p20', 'z__p30', 'z__p99', 'z__p90', 'z__p80', 'z__p70',
       'z__iqr', 'z__kurtosis', 'z__skew', 'xyz__amin', 'xyz__amax',
       'xyz__sum', 'xyz__median', 'xyz__mean', 'xyz__std', 'xyz__var',
       'xyz__p01', 'xyz__p10', 'xyz__p20', 'xyz__p30', 'xyz__p99', 'xyz__p90',
       'xyz__p80', 'xyz__p70', 'xyz__iqr', 'xyz__kurtosis', 'xyz__skew'],
      dtype='object')

(200,)

0    100
1    100
Name: status, dtype: int64

# CV

## Multiple Models

Reference: scikit-learn classifier comparison example — https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, log_loss

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

# First Run

In [6]:
# First pass: cross-validate every candidate classifier, the ensembles at
# several sizes. Each entry is a (classifier class, constructor kwargs) pair
# that t.grid_exec receives.
n_est_list = [100, 400, 800]
models_and_params = [
    # A single decision tree permutes the features while searching for splits,
    # so it is seeded like every other stochastic model in this list.
    (DecisionTreeClassifier, {'random_state': RS}),
    (KNeighborsClassifier, {}),

    *[ (RandomForestClassifier, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],

    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBClassifier,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# 5 splits x 5 repeats = 25 folds per model. The splitter is seeded with the
# notebook-wide RS: without a seed the fold assignment changes on every run and
# the reported scores cannot be reproduced.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Evaluate each configuration and refresh the leaderboard (best ROC AUC first)
# through the on1Completed callback.
# NOTE(review): the exact contract of grid_exec / cv_classification lives in the
# Toolkit modules -- confirm there how the (class, kwargs) pairs are instantiated.
ret = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics),
    models_and_params,
    on1Completed = lambda results: m.display_stats(
        pd.DataFrame(results).sort_values(by=['ROC_AUC_mean'], ascending=False)
    )
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
7,AdaBoostClassifier,'n_estimators': 800,25,0.991,0.016,17,0.472,0.016,15,61.6
6,AdaBoostClassifier,'n_estimators': 400,25,0.99,0.017,16,0.466,0.017,14,31.4
10,LGBMClassifier,'n_estimators': 800,25,0.988,0.016,14,0.247,0.247,11,61.6
9,LGBMClassifier,'n_estimators': 400,25,0.988,0.016,14,0.247,0.247,12,44.9
5,AdaBoostClassifier,'n_estimators': 100,25,0.986,0.017,13,0.44,0.022,13,7.4
8,LGBMClassifier,'n_estimators': 100,25,0.986,0.02,12,0.18,0.148,3,31.7
15,CatBoostClassifier,'n_estimators': 400,25,0.986,0.013,11,0.178,0.065,2,87.2
16,CatBoostClassifier,'n_estimators': 800,25,0.985,0.013,10,0.177,0.066,1,118.5
14,CatBoostClassifier,'n_estimators': 100,25,0.984,0.015,9,0.185,0.061,4,26.7
4,RandomForestClassifier,'n_estimators': 800,25,0.975,0.02,8,0.235,0.04,8,37.8


# Shortlisted Best 3

In [7]:
# Second pass: only the three strongest model families, with larger ensembles
# and twice as many CV repeats.
# trained_models is handed to cv_classification and read back afterwards by
# m.get_fe_df, so it is expected to be filled during this run.
trained_models = []
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# 5 splits x 10 repeats = 50 folds per model. The splitter is seeded with the
# notebook-wide RS: without a seed the fold assignment changes on every run and
# the reported scores cannot be reproduced.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Evaluate each configuration and refresh the leaderboard (best ROC AUC first)
# through the on1Completed callback.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda results: m.display_stats(
        pd.DataFrame(results).sort_values(by=['ROC_AUC_mean'], ascending=False)
    )
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
1,AdaBoostClassifier,'n_estimators': 1600,50,0.993,0.013,6,0.475,0.02,6,156.3
0,AdaBoostClassifier,'n_estimators': 800,50,0.992,0.012,5,0.472,0.021,5,80.4
2,LGBMClassifier,'n_estimators': 800,50,0.988,0.014,3,0.245,0.21,3,4.1
3,LGBMClassifier,'n_estimators': 1600,50,0.988,0.014,3,0.245,0.21,4,6.1
5,CatBoostClassifier,'n_estimators': 1600,50,0.986,0.012,2,0.171,0.065,1,473.1
4,CatBoostClassifier,'n_estimators': 800,50,0.985,0.013,1,0.174,0.064,2,227.2


In [8]:
# Build the feature-ranking table from the models collected in trained_models.
# The displayed result has one "<model>_rank" column per model plus "sum_rank",
# and is indexed by feature name.
# NOTE(review): how the ranks are derived is defined in m.get_fe_df -- confirm there.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,ABC_1_rank,LGBMC_2_rank,LGBMC_3_rank,CBC_4_rank,CBC_5_rank,sum_rank
x__skew,71,71,72,72,71,71,428
xyz__amin,68,68,70,70,72,72,420
x__sum,72,72,68,68,70,70,420
z__skew,70,70,71,71,64,65,411
z__sum,67,67,69,69,68,68,408
x__kurtosis,61,60,67,67,69,69,393
y__skew,69,69,66,66,57,57,384
z__p99,62,62,64,64,65,64,381
z__mean,63,63,61,60,66,67,380
xyz__kurtosis,65,65,63,63,56,56,368


## Remove worst 10 features

In [9]:
# First drop set: the feature names in the last ten rows of the ranking table.
# NOTE(review): assumes df_final_fe is ordered best-to-worst (the displayed table
# shows the highest sum_rank first) -- confirm in m.get_fe_df.
features_to_drop = df_final_fe.tail(10).index.to_list()
features_to_drop

['y__var',
 'xyz__std',
 'x__var',
 'xyz__median',
 'xyz__mean',
 'xyz__p90',
 'x__p01',
 'xyz__p70',
 'xyz__p80',
 'xyz__var']

In [10]:
# Re-evaluate the shortlisted models on the feature set without the names in
# features_to_drop.
# trained_models is reset because m.get_fe_df reads it afterwards to rank the
# remaining features.
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# 5 splits x 10 repeats = 50 folds per model. The splitter is seeded with the
# notebook-wide RS: without a seed the folds change on every run, and the small
# score differences between feature sets cannot be told apart from fold noise.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Evaluate each configuration on the reduced feature matrix and refresh the
# leaderboard (best ROC AUC first) through the on1Completed callback.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda results: m.display_stats(
        pd.DataFrame(results).sort_values(by=['ROC_AUC_mean'], ascending=False)
    )
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,50,0.993,0.011,3,0.472,0.02,3,77.1
1,LGBMClassifier,'n_estimators': 800,50,0.987,0.015,2,0.246,0.219,2,4.0
2,CatBoostClassifier,'n_estimators': 800,50,0.986,0.012,1,0.169,0.063,1,233.3


In [11]:
# Rebuild the feature-ranking table from the models trained on the reduced
# feature set (trained_models was refilled by the preceding CV run).
# NOTE(review): how the ranks are derived is defined in m.get_fe_df -- confirm there.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
x__skew,61,62,61,184
x__sum,62,58,60,180
xyz__amin,58,59,62,179
z__skew,60,61,54,175
z__sum,57,60,58,175
x__kurtosis,49,57,59,165
y__skew,59,56,47,162
z__p99,53,54,55,162
z__mean,52,50,57,159
y__sum,51,55,50,156


## Remove second worst 10 features

In [12]:
# Second drop set: the feature names in the last ten rows of the current
# ranking table; the bare expression below displays them.
features_to_drop_2 = df_final_fe.index.to_list()[-10:]
features_to_drop_2
# Add them to the cumulative drop list, skipping names that are already there.
# A plain extend() appended the same ten names again each time this cell was
# re-run; filtering makes the cell safe to execute repeatedly.
features_to_drop.extend([name for name in features_to_drop_2 if name not in features_to_drop])

['z__p20',
 'y__p20',
 'x__p20',
 'x__p80',
 'xyz__p10',
 'y__amax',
 'xyz__amax',
 'z__p30',
 'x__p99',
 'x__p90']

In [13]:
# Re-evaluate the shortlisted models after the second round of feature removal
# (features_to_drop now holds the cumulative drop list).
# trained_models is reset because m.get_fe_df reads it afterwards to rank the
# remaining features.
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# 5 splits x 10 repeats = 50 folds per model. The splitter is seeded with the
# notebook-wide RS: without a seed the folds change on every run, and the small
# score differences between feature sets cannot be told apart from fold noise.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Evaluate each configuration on the reduced feature matrix and refresh the
# leaderboard (best ROC AUC first) through the on1Completed callback.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda results: m.display_stats(
        pd.DataFrame(results).sort_values(by=['ROC_AUC_mean'], ascending=False)
    )
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,50,0.994,0.011,3,0.47,0.023,3,72.4
1,LGBMClassifier,'n_estimators': 800,50,0.988,0.015,2,0.242,0.217,2,4.2
2,CatBoostClassifier,'n_estimators': 800,50,0.987,0.012,1,0.163,0.061,1,259.7


In [14]:
# Rebuild the feature-ranking table from the models trained after the second
# round of feature removal (trained_models was refilled by the preceding CV run).
# NOTE(review): how the ranks are derived is defined in m.get_fe_df -- confirm there.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
x__skew,51,52,51,154
x__sum,52,48,50,150
xyz__amin,48,50,52,150
z__skew,50,51,44,145
z__sum,46,49,48,143
x__kurtosis,38,47,49,134
z__p99,43,44,45,132
y__skew,49,46,37,132
z__mean,42,43,46,131
y__sum,41,45,40,126


## Remove third worst 10 features

In [15]:
# Third drop set: the feature names in the last ten rows of the current
# ranking table; the bare expression below displays them.
features_to_drop_3 = df_final_fe.index.to_list()[-10:]
features_to_drop_3
# Add them to the cumulative drop list, skipping names that are already there.
# A plain extend() appended the same ten names again each time this cell was
# re-run; filtering makes the cell safe to execute repeatedly.
features_to_drop.extend([name for name in features_to_drop_3 if name not in features_to_drop])

['x__std',
 'y__median',
 'z__var',
 'xyz__sum',
 'y__iqr',
 'xyz__p20',
 'xyz__p30',
 'y__p99',
 'xyz__p01',
 'z__p10']

In [16]:
# Re-evaluate the shortlisted models after the third round of feature removal,
# again at two ensemble sizes (features_to_drop holds the cumulative drop list).
# trained_models is reset because m.get_fe_df reads it afterwards to rank the
# remaining features.
trained_models = []
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# 5 splits x 10 repeats = 50 folds per model. The splitter is seeded with the
# notebook-wide RS: without a seed the folds change on every run, and the small
# score differences between feature sets cannot be told apart from fold noise.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Evaluate each configuration on the reduced feature matrix and refresh the
# leaderboard (best ROC AUC first) through the on1Completed callback.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda results: m.display_stats(
        pd.DataFrame(results).sort_values(by=['ROC_AUC_mean'], ascending=False)
    )
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
1,AdaBoostClassifier,'n_estimators': 1600,50,0.994,0.009,6,0.467,0.019,6,196.5
0,AdaBoostClassifier,'n_estimators': 800,50,0.994,0.01,5,0.465,0.019,5,68.1
5,CatBoostClassifier,'n_estimators': 1600,50,0.989,0.011,4,0.152,0.058,1,782.5
2,LGBMClassifier,'n_estimators': 800,50,0.989,0.014,2,0.236,0.209,4,10.1
3,LGBMClassifier,'n_estimators': 1600,50,0.989,0.014,2,0.236,0.209,3,8.3
4,CatBoostClassifier,'n_estimators': 800,50,0.989,0.011,1,0.154,0.058,2,288.9


In [17]:
# Rebuild the feature-ranking table from the models trained after the third
# round of feature removal (trained_models was refilled by the preceding CV run).
# NOTE(review): how the ranks are derived is defined in m.get_fe_df -- confirm there.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,ABC_1_rank,LGBMC_2_rank,LGBMC_3_rank,CBC_4_rank,CBC_5_rank,sum_rank
x__skew,41,41,42,42,41,41,248
x__sum,42,42,38,38,40,40,240
xyz__amin,37,37,40,40,42,42,238
z__skew,40,40,41,41,35,34,231
z__sum,35,35,39,39,38,38,224
z__p99,33,33,35,37,36,36,210
x__kurtosis,29,28,37,36,39,39,208
y__skew,39,39,36,35,27,27,203
z__mean,32,32,29,28,34,35,190
y__sum,28,30,34,34,29,30,185
