# Setup and Imports

In [1]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [2]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

RS = 35577 # global random state seed
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [3]:
import time

from sklearn.preprocessing import StandardScaler, PowerTransformer

In [4]:
(X, y) = t.from_pkl(f'{processed_data_path}/xyz_distance.data.pkl')

X.shape
X.columns

y.shape
y.value_counts()

(200, 72)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__p01', 'x__p10', 'x__p20', 'x__p30', 'x__p99', 'x__p90',
       'x__p80', 'x__p70', 'x__iqr', 'x__kurtosis', 'x__skew', 'y__amin',
       'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std', 'y__var',
       'y__p01', 'y__p10', 'y__p20', 'y__p30', 'y__p99', 'y__p90', 'y__p80',
       'y__p70', 'y__iqr', 'y__kurtosis', 'y__skew', 'z__amin', 'z__amax',
       'z__sum', 'z__median', 'z__mean', 'z__std', 'z__var', 'z__p01',
       'z__p10', 'z__p20', 'z__p30', 'z__p99', 'z__p90', 'z__p80', 'z__p70',
       'z__iqr', 'z__kurtosis', 'z__skew', 'xyz__amin', 'xyz__amax',
       'xyz__sum', 'xyz__median', 'xyz__mean', 'xyz__std', 'xyz__var',
       'xyz__p01', 'xyz__p10', 'xyz__p20', 'xyz__p30', 'xyz__p99', 'xyz__p90',
       'xyz__p80', 'xyz__p70', 'xyz__iqr', 'xyz__kurtosis', 'xyz__skew'],
      dtype='object')

(200,)

0    100
1    100
Name: status, dtype: int64

# CV

## Multiple Models

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, log_loss

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

# First Run

In [6]:
n_est_list = [100, 400, 800]
models_and_params = [
    (DecisionTreeClassifier, {}),
    (KNeighborsClassifier, {}),
    
    *[ (RandomForestClassifier, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBClassifier,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5)
metrics = [ roc_auc_score, log_loss ]

ret = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
7,AdaBoostClassifier,'n_estimators': 800,25,0.994,0.01,17,0.358,0.084,15,50.7
16,CatBoostClassifier,'n_estimators': 800,25,0.993,0.007,16,0.116,0.054,4,108.2
6,AdaBoostClassifier,'n_estimators': 400,25,0.993,0.01,15,0.358,0.078,14,25.0
10,LGBMClassifier,'n_estimators': 800,25,0.993,0.01,13,0.186,0.179,9,11.6
9,LGBMClassifier,'n_estimators': 400,25,0.993,0.01,13,0.186,0.179,8,9.7
15,CatBoostClassifier,'n_estimators': 400,25,0.992,0.009,12,0.119,0.055,5,66.2
14,CatBoostClassifier,'n_estimators': 100,25,0.992,0.01,11,0.125,0.053,6,19.5
8,LGBMClassifier,'n_estimators': 100,25,0.992,0.012,10,0.14,0.117,7,7.3
11,XGBClassifier,'n_estimators': 100,25,0.99,0.01,8,0.115,0.069,1,249.0
12,XGBClassifier,'n_estimators': 400,25,0.99,0.01,8,0.115,0.069,2,262.5


# Shortlisted Best 3

In [7]:
trained_models = []
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics, trained_models), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
1,AdaBoostClassifier,'n_estimators': 1600,50,0.994,0.008,6,0.352,0.097,5,148.0
0,AdaBoostClassifier,'n_estimators': 800,50,0.993,0.009,5,0.352,0.091,6,75.9
5,CatBoostClassifier,'n_estimators': 1600,50,0.992,0.01,4,0.118,0.064,1,540.4
4,CatBoostClassifier,'n_estimators': 800,50,0.992,0.01,3,0.119,0.061,2,201.6
2,LGBMClassifier,'n_estimators': 800,50,0.992,0.012,1,0.199,0.206,4,3.9
3,LGBMClassifier,'n_estimators': 1600,50,0.992,0.012,1,0.199,0.206,3,4.9


In [8]:
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,ABC_1_rank,LGBMC_2_rank,LGBMC_3_rank,CBC_4_rank,CBC_5_rank,sum_rank
xyz__amin,72,72,72,72,72,72,432
x__median,69,69,68,68,71,71,416
y__sum,66,66,70,70,70,70,412
x__skew,67,65,69,69,69,69,408
z__skew,71,70,67,67,59,60,394
y__mean,68,68,59,59,68,68,390
z__mean,63,63,63,62,67,67,385
z__p99,65,67,65,64,62,62,385
z__sum,54,54,71,71,65,66,381
x__sum,64,64,61,61,64,64,378


## Remove worst 10 features

In [9]:
features_to_drop = df_final_fe.index.to_list()[-10:]
features_to_drop

['x__p99',
 'x__p10',
 'xyz__std',
 'y__var',
 'z__p30',
 'z__var',
 'xyz__p90',
 'xyz__var',
 'z__p20',
 'x__var']

In [10]:
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,50,0.994,0.009,3,0.347,0.096,3,78.4
2,CatBoostClassifier,'n_estimators': 800,50,0.993,0.009,2,0.117,0.061,1,352.5
1,LGBMClassifier,'n_estimators': 800,50,0.992,0.011,1,0.193,0.2,2,17.6


In [11]:
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
xyz__amin,62,62,62,186
y__sum,58,61,60,179
x__median,59,58,61,178
x__skew,55,59,59,173
z__skew,60,57,50,167
z__mean,54,53,57,164
z__p99,57,55,52,164
y__mean,56,50,58,164
z__sum,44,60,56,160
x__sum,53,52,53,158


# Second 10 Worst Features

In [12]:
features_to_drop_2 = df_final_fe.index.to_list()[-10:]
features_to_drop_2
features_to_drop.extend(features_to_drop_2)

['y__p30',
 'x__p01',
 'xyz__p80',
 'z__p01',
 'x__std',
 'y__std',
 'xyz__amax',
 'x__p90',
 'xyz__p70',
 'z__iqr']

In [13]:
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,50,0.994,0.009,3,0.342,0.097,3,117.3
2,CatBoostClassifier,'n_estimators': 800,50,0.993,0.009,2,0.113,0.059,1,334.9
1,LGBMClassifier,'n_estimators': 800,50,0.993,0.01,1,0.185,0.184,2,999.2


In [14]:
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
xyz__amin,52,52,52,156
x__median,49,48,51,148
y__sum,46,51,50,147
x__skew,45,49,48,142
z__skew,50,47,39,136
z__p99,48,46,42,136
y__mean,47,36,49,132
z__mean,44,42,45,131
x__sum,43,43,43,129
z__sum,33,50,44,127


# Third 10 worst features

In [15]:
features_to_drop_3 = df_final_fe.index.to_list()[-10:]
features_to_drop_3
features_to_drop.extend(features_to_drop_3)

['z__std',
 'x__iqr',
 'xyz__iqr',
 'xyz__skew',
 'z__amin',
 'z__p80',
 'y__p01',
 'xyz__sum',
 'y__p99',
 'y__amax']

In [16]:
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
metrics = [ roc_auc_score, log_loss ]

r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models), 
    models_and_params, 
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
0,AdaBoostClassifier,'n_estimators': 800,50,0.995,0.007,3,0.332,0.099,3,130.1
2,CatBoostClassifier,'n_estimators': 800,50,0.993,0.009,2,0.108,0.06,1,283.6
1,LGBMClassifier,'n_estimators': 800,50,0.993,0.01,1,0.185,0.195,2,977.4


In [17]:
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
xyz__amin,42,42,42,126
y__sum,38,41,40,119
x__median,39,36,41,116
x__skew,35,39,38,112
z__p99,40,37,32,109
z__skew,37,38,28,103
y__mean,36,28,39,103
x__sum,34,33,33,100
z__mean,32,30,36,98
z__sum,23,40,34,97
