# Setup and Imports

In [1]:
# Optional one-time setup for the gradient-boosting libraries imported further down.
# Left commented out so a normal run does not reinstall anything; uncomment to use.
# Prefer `%pip` over `!pip` so the packages land in the running kernel's environment:
# %pip install xgboost
# %pip install lightgbm
# %pip install catboost

In [2]:
import sys
if '../Toolkit' not in sys.path: sys.path.append('../Toolkit')

%load_ext autoreload
%autoreload 1

%aimport tools
%aimport models

import tools as t
import models as m

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
# Several cells below rely on this to show multiple results (e.g. X.shape and X.columns).
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
# Pandas display limits: None removes the column cap so wide frames are shown in full.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
# NOTE(review): 113 looks like an ad-hoc row cap -- confirm the intended value.
pd.options.display.max_rows = 113
import matplotlib.pyplot as plt
import seaborn as sns

# Shared seed passed as `random_state` to the estimators configured below.
RS = 35577 # global random state seed
# Data directories, relative to the notebook's working directory.
raw_data_path = 'data-raw'
processed_data_path = 'data-processed'

In [3]:
import time

from sklearn.preprocessing import StandardScaler, PowerTransformer

In [22]:
# Load the pickled (features, target) pair from the processed-data directory.
dataset_file = f'{processed_data_path}/b1_b2_concat_ax0.data.pkl'
X, y = t.from_pkl(dataset_file)

# Feature matrix: dimensions and column names.
X.shape
X.columns

# Target: length and class balance.
y.shape
y.value_counts()

(200, 54)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__p01', 'x__p10', 'x__p20', 'x__p30', 'x__p99', 'x__p90',
       'x__p80', 'x__p70', 'x__iqr', 'x__kurtosis', 'x__skew', 'y__amin',
       'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std', 'y__var',
       'y__p01', 'y__p10', 'y__p20', 'y__p30', 'y__p99', 'y__p90', 'y__p80',
       'y__p70', 'y__iqr', 'y__kurtosis', 'y__skew', 'z__amin', 'z__amax',
       'z__sum', 'z__median', 'z__mean', 'z__std', 'z__var', 'z__p01',
       'z__p10', 'z__p20', 'z__p30', 'z__p99', 'z__p90', 'z__p80', 'z__p70',
       'z__iqr', 'z__kurtosis', 'z__skew'],
      dtype='object')

(200,)

0    100
1    100
Name: status, dtype: int64

# Cross-Validation (CV)

## Multiple Models

Reference: scikit-learn's classifier comparison example — https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score, log_loss

from sklearn.model_selection import KFold, RepeatedKFold, StratifiedKFold, RepeatedStratifiedKFold

### First Run

In [7]:
# First screening pass: every candidate is a (estimator class, constructor kwargs) pair.
# Ensemble models are tried at several ensemble sizes.
n_est_list = [100, 400, 800]
models_and_params = [
    # Decision trees permute candidate features at each split, so they need the
    # shared seed as well to give the same result on every run.
    (DecisionTreeClassifier, {'random_state': RS}),
    (KNeighborsClassifier, {}),

    *[ (RandomForestClassifier, {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],

    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (XGBClassifier,          {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# Seed the splitter: without random_state the 5x5 folds are reshuffled on every run,
# so scores would neither be reproducible nor computed on the same folds per model.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Cross-validate each candidate; after each one completes, redisplay the running
# leaderboard sorted by mean ROC AUC (best first).
ret = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics),
    models_and_params,
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
16,CatBoostClassifier,'n_estimators': 800,25,0.987,0.016,17,0.165,0.076,1,89.3
15,CatBoostClassifier,'n_estimators': 400,25,0.986,0.016,16,0.168,0.075,2,44.3
7,AdaBoostClassifier,'n_estimators': 800,25,0.985,0.02,15,0.441,0.023,15,32.6
14,CatBoostClassifier,'n_estimators': 100,25,0.985,0.015,14,0.172,0.071,3,11.2
8,LGBMClassifier,'n_estimators': 100,25,0.984,0.018,13,0.192,0.151,4,0.7
9,LGBMClassifier,'n_estimators': 400,25,0.984,0.02,11,0.286,0.256,11,1.3
10,LGBMClassifier,'n_estimators': 800,25,0.984,0.02,11,0.286,0.256,11,1.5
6,AdaBoostClassifier,'n_estimators': 400,25,0.984,0.019,10,0.436,0.025,14,16.2
4,RandomForestClassifier,'n_estimators': 800,25,0.983,0.016,9,0.201,0.044,5,27.8
2,RandomForestClassifier,'n_estimators': 100,25,0.983,0.015,8,0.202,0.045,7,3.6


### Shortlisted Best 3 Models

In [8]:
# Second pass over the three shortlisted model families, with larger ensembles
# and more CV repeats than the first run.
# `trained_models` is handed to m.cv_classification and read afterwards by
# m.get_fe_df -- presumably the fitted models are collected in it (confirm in models.py).
trained_models = []
n_est_list = [800, 1600]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# Seed the splitter so the 5x10 folds are identical on every run and for every model;
# an unseeded splitter reshuffles each time and makes the scores non-reproducible.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# Cross-validate each candidate and refresh the leaderboard (best mean ROC AUC first)
# after each one completes.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X, y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
1,AdaBoostClassifier,'n_estimators': 1600,50,0.986,0.017,6,0.445,0.026,6,130.1
0,AdaBoostClassifier,'n_estimators': 800,50,0.986,0.018,5,0.442,0.026,5,66.2
5,CatBoostClassifier,'n_estimators': 1600,50,0.986,0.015,4,0.163,0.076,1,362.8
4,CatBoostClassifier,'n_estimators': 800,50,0.986,0.015,3,0.164,0.074,2,174.3
2,LGBMClassifier,'n_estimators': 800,50,0.985,0.017,1,0.278,0.23,3,3.2
3,LGBMClassifier,'n_estimators': 1600,50,0.985,0.017,1,0.278,0.23,3,4.9


In [31]:
# Build the per-model feature ranking table from the models collected during the
# CV run above. The displayed frame has one rank column per model plus `sum_rank`.
# NOTE(review): rows appear ordered by sum_rank, highest first -- later cells rely
# on that ordering when they take the last ten rows; confirm in models.get_fe_df.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
x__skew,42,44,43,129
x__median,41,42,44,127
z__skew,43,40,35,118
x__sum,40,37,40,117
z__p99,39,41,37,117
z__mean,37,33,39,109
z__amax,31,34,42,107
z__sum,25,43,38,106
y__sum,33,39,29,101
x__mean,27,31,41,99


## Remove worst 10 features

In [33]:
# Take the index labels of the last ten rows of the ranking table as the
# features to exclude from the next CV run.
features_to_drop = df_final_fe.tail(10).index.to_list()
features_to_drop

['x__p01',
 'z__std',
 'x__p90',
 'x__iqr',
 'x__amin',
 'z__p30',
 'x__p99',
 'z__p80',
 'y__p99',
 'x__std']

In [34]:
# Re-run the shortlisted models with the ten lowest-ranked features removed.
# Start from an empty list so the ranking built afterwards only reflects this run.
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# Seed the splitter so this run is reproducible and every model is scored on the
# same folds; an unseeded splitter draws new folds on each run, which confounds
# the effect of dropping features with fold-to-fold noise.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# X itself is left untouched; the reduced frame is built on the fly for each model.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
2,CatBoostClassifier,'n_estimators': 800,50,0.988,0.013,3,0.156,0.073,1,187.6
1,LGBMClassifier,'n_estimators': 800,50,0.986,0.016,2,0.271,0.217,2,4.5
0,AdaBoostClassifier,'n_estimators': 800,50,0.985,0.019,1,0.436,0.026,3,66.7


In [35]:
# Rebuild the feature ranking table from the models of the reduced-feature run above.
# This rebinds `df_final_fe`, replacing the table computed on the full feature set.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
x__median,42,42,44,128
x__skew,41,44,43,128
z__skew,43,40,35,118
x__sum,39,37,40,116
z__p99,37,39,36,112
z__mean,36,33,39,108
z__sum,25,43,38,106
z__amax,30,34,42,106
y__sum,34,41,29,104
x__mean,26,31,41,98


## Remove second-worst 10 features

In [37]:
# Last ten rows of the current ranking table (computed without the first batch
# of dropped features) form the second batch to remove.
features_to_drop_2 = df_final_fe.index.to_list()[-10:]
features_to_drop_2
# Rebuild the cumulative drop list instead of extending it in place, so re-running
# this cell cannot append the same names twice. dict.fromkeys removes duplicates
# while keeping the original order.
features_to_drop = list(dict.fromkeys([*features_to_drop, *features_to_drop_2]))

['z__p90',
 'z__p10',
 'y__amax',
 'z__p01',
 'y__std',
 'x__p10',
 'x__var',
 'y__var',
 'z__iqr',
 'z__p20']

In [38]:
# Re-run the shortlisted models once more. By this point `features_to_drop` holds
# both batches of low-ranked features, so this run excludes all of them.
# Start from an empty list so the ranking built afterwards only reflects this run.
trained_models = []
n_est_list = [800]
models_and_params = [
    *[ (AdaBoostClassifier,     {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (LGBMClassifier,         {'random_state': RS, 'n_estimators': n_e}) for n_e in n_est_list ],
    *[ (CatBoostClassifier,     {'random_state': RS, 'n_estimators': n_e, 'silent': True}) for n_e in n_est_list ],
]

# Seed the splitter so this run is reproducible and every model is scored on the
# same folds; an unseeded splitter draws new folds on each run, which confounds
# the effect of dropping features with fold-to-fold noise.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=RS)
metrics = [ roc_auc_score, log_loss ]

# X itself is left untouched; the reduced frame is built on the fly for each model.
r = t.grid_exec(
    lambda model: m.cv_classification(model, cv, X.drop(columns=features_to_drop), y, metrics, trained_models),
    models_and_params,
    on1Completed = lambda r: m.display_stats(pd.DataFrame(r).sort_values(by=['ROC_AUC_mean'], ascending=False))
)

Unnamed: 0,model,params,n_folds,ROC_AUC_mean,ROC_AUC_std,#1,log_loss_mean,log_loss_std,#2,time
2,CatBoostClassifier,'n_estimators': 800,50,0.989,0.013,3,0.151,0.071,1,165.1
0,AdaBoostClassifier,'n_estimators': 800,50,0.988,0.017,2,0.43,0.025,3,65.5
1,LGBMClassifier,'n_estimators': 800,50,0.987,0.015,1,0.263,0.215,2,3.8


In [39]:
# Rebuild the feature ranking table from the models of the latest reduced-feature run.
# This rebinds `df_final_fe` again; the ranks now cover only the remaining features.
df_final_fe = m.get_fe_df(trained_models)
df_final_fe

Unnamed: 0,ABC_0_rank,LGBMC_1_rank,CBC_2_rank,sum_rank
x__skew,32,34,33,99
x__median,31,30,34,95
z__p99,29,32,31,92
z__skew,33,29,24,86
x__sum,28,27,29,84
z__sum,14,33,27,74
z__mean,24,21,28,73
z__amax,17,24,32,73
y__sum,19,31,21,71
y__mean,30,23,18,71
