# Setup

In [1]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [2]:
%load_ext autoreload
%autoreload 1
%aimport transformations

%aimport transformations
%aimport f

transformations loaded, version: 0.1
framework loaded, version: 0.1


In [3]:
# Notebook-wide imports and display configuration.
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to print several bare expressions at once.
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import time

import numpy as np
import pandas as pd
# Widen pandas' display limits so wide summary tables are not truncated.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler, PowerTransformer

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

RS = 35577 # global random state seed (used by cross_val and the model grids)
# NOTE(review): data_path is not referenced in the visible cells — confirm it is still needed.
data_path = './data-raw/'

In [4]:
# Load the pre-processed feature matrix and target from the project helper
# module `f`.
# NOTE(review): presumably this unpickles the file — only load pickles from
# trusted sources.
X, y = f.from_pkl('data-processed/all_b1_b2.data.pkl')

# Quick sanity checks; every bare expression below is displayed because
# ast_node_interactivity is set to 'all' in the config cell.
X.shape
X.columns

y.shape
y.value_counts()

(200, 36)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__q1', 'x__q3', 'x__iqr', 'x__kurtosis', 'x__skew',
       'y__amin', 'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std',
       'y__var', 'y__q1', 'y__q3', 'y__iqr', 'y__kurtosis', 'y__skew',
       'z__amin', 'z__amax', 'z__sum', 'z__median', 'z__mean', 'z__std',
       'z__var', 'z__q1', 'z__q3', 'z__iqr', 'z__kurtosis', 'z__skew'],
      dtype='object')

(200,)

1    100
0    100
Name: status, dtype: int64

# CV

In [5]:
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score

In [6]:
import itertools

def grid_exec(func, *args):
    """Call ``func`` once for every combination in the Cartesian product of ``args``.

    Each positional argument after ``func`` is an iterable of candidate values
    for the matching positional parameter of ``func``. The results are returned
    as a list in ``itertools.product`` order (the last iterable varies fastest).
    """
    combinations = itertools.product(*args)
    return list(itertools.starmap(func, combinations))

def grid_exec_callback(func, callback, *args):
    """Run ``func`` over the Cartesian product of ``args``, piping each result through ``callback``.

    For every combination (in ``itertools.product`` order) ``func`` is called
    with the combination unpacked as positional arguments, and ``callback`` is
    invoked immediately with that result. The list of callback return values
    is returned.
    """
    outcomes = []
    for combination in itertools.product(*args):
        result = func(*combination)
        outcomes.append(callback(result))
    return outcomes

In [7]:
# Every model fitted by cross_val is appended here so later cells can inspect it.
all_trained_models = []

def cross_val(model_and_params, X, y, n_folds = 5, n_fold_repeats = 1, n_model_repeats = 1):
    """Score one model configuration with repeated stratified K-fold cross-validation.

    Parameters
    ----------
    model_and_params : tuple
        ``(constructor, params_dic)``; the model is built as
        ``constructor(**params)``. If ``params_dic`` contains a
        ``'random_state'`` key, a fresh seed is drawn for every model built.
        ``params_dic`` itself is never modified.
    X, y : pandas objects
        Features and target; rows are selected positionally with ``.iloc``.
    n_folds : int
        Number of splits per repetition.
    n_fold_repeats : int
        Number of times the K-fold split is repeated.
    n_model_repeats : int
        Number of models trained and scored on each split.

    Returns
    -------
    tuple
        ``(model_name, model_params, n_folds_completed, n_model_repeats,
        total_elapsed_time, scores)`` where ``model_params`` are the
        parameters of the last model built, ``n_folds_completed`` counts every
        split processed (across repeats) and ``scores`` is a NumPy array of
        ROC AUC values with one row per split and one column per model repeat.

    Notes
    -----
    The global NumPy RNG is reseeded with ``RS`` on every call, so the
    sequence of drawn seeds is the same for each call. Every fitted model is
    appended to the module-level ``all_trained_models`` list. Models must
    implement ``predict_proba``; column 1 is used as the positive-class score.
    """
    constructor, params_dic = model_and_params

    # Reseeding the *global* RNG is deliberate: the seeds below come from it.
    # NOTE(review): estimators built without an explicit random_state
    # presumably draw from this global RNG as well — confirm before changing.
    np.random.seed(RS)

    def get_random():
        return np.random.randint(1, 2**16)

    cv_scores = []
    n_folds_completed = 0

    # Fallbacks used only when no model is built (n_model_repeats == 0).
    model_name = getattr(constructor, '__name__', str(constructor))
    model_params = params_dic.copy()

    start_time = time.perf_counter()

    cv = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_fold_repeats, random_state=get_random())
    for idx_train, idx_test in cv.split(X, y):
        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

        model_scores = []
        for _ in range(n_model_repeats):
            # Work on a private copy so the caller's dict keeps its original seed.
            params = params_dic.copy()
            if 'random_state' in params:
                params['random_state'] = get_random()

            model = constructor(**params)
            all_trained_models.append(model)

            model.fit(X_train, y_train)

            y_pred = model.predict_proba(X_test)[:, 1]
            model_scores.append(roc_auc_score(y_test, y_pred))

            model_name = model.__class__.__name__
            model_params = params

        cv_scores.append(model_scores)
        n_folds_completed += 1

    total_elapsed_time = time.perf_counter() - start_time
    return model_name, model_params, n_folds_completed, n_model_repeats, total_elapsed_time, np.array(cv_scores)

In [8]:
def get_stats_df(cv_results):
    """Summarise a list of ``cross_val`` results as a ranked DataFrame.

    Parameters
    ----------
    cv_results : list of tuple
        Each item is indexed as ``(model_name, model_params, n_folds,
        n_model_repeats, elapsed_seconds, scores)`` where ``scores`` is a 2-D
        NumPy array (rows = folds, columns = model repeats).

    Returns
    -------
    pandas.DataFrame
        One row per result with overall score statistics (``min``, ``mean``,
        ``max``, ``std``), the same statistics over per-fold means
        (``folds_*``) and the elapsed time rounded to 0.1 s. Rows are sorted
        by descending ``mean`` then ascending ``std``. An empty input yields
        an empty DataFrame.

    The ``'random_state'`` and ``'silent'`` entries are hidden from the
    ``params`` column; the dictionaries stored in ``cv_results`` are left
    untouched.
    """
    rows = []

    for result in cv_results:
        # Copy first: popping from result[1] directly would strip these keys
        # from the caller's stored results.
        model_params = result[1].copy()
        model_params.pop('random_state', None)
        model_params.pop('silent', None)

        stats_ = result[5]
        fold_means = stats_.mean(axis=1)  # mean over model repeats, per fold

        rows.append(
            {
                'model' : result[0],
                'params' : str(model_params).strip('{').strip('}'),
                'n_folds' : result[2],
                'n_model_repeats' : result[3],

                'min': stats_.min(),
                'mean': stats_.mean(),
                'max': stats_.max(),
                'std': stats_.std(),

                'folds_min': fold_means.min(),
                'folds_mean': fold_means.mean(),
                'folds_max': fold_means.max(),
                'folds_std': fold_means.std(),

                'time': round(result[4], 1),
            })

    df = pd.DataFrame(rows)
    if df.empty:
        # Nothing to rank; sorting would fail on the missing 'mean' column.
        return df

    return df.sort_values(by=['mean', 'std'], ascending=[False, True]).reset_index(drop=True)

def display_stats(df_stats):
    """Render the CV summary table with number formatting and colour cues.

    ``time`` gets one decimal and an in-cell bar; the overall score columns
    are shaded blue and the per-fold columns green, both by rank within the
    column; any column whose name starts with ``'#'`` is shaded orange.
    The styled table is shown with ``display``; nothing is returned.
    """
    overall_cols = ['min', 'mean', 'max', 'std']
    per_fold_cols = ['folds_min', 'folds_mean', 'folds_max', 'folds_std']
    marker_cols = [name for name in df_stats.columns if name[0] == '#']

    table = (
        df_stats.style
        .format('{:,.1f}', 'time')
        .bar(subset='time')
        .background_gradient(cmap='Oranges', subset=marker_cols)
        .format('{:,.3f}', overall_cols + per_fold_cols)
    )

    # Shade each score column by the rank of its values rather than raw magnitude.
    for cmap_name, columns in (('Blues', overall_cols), ('Greens', per_fold_cols)):
        for column in columns:
            table = table.background_gradient(cmap=cmap_name, subset=column, gmap=df_stats[column].rank())

    display(table)
    
# def display_stats_v1(df_stats):
#     display(df_stats.style\
#             .format('{:,.3f}', df_stats.columns[5:])\
#             .format('{:,.1f}', df_stats.columns[-1])\
#             .background_gradient(cmap='Reds', subset=['#1', '#2'], axis=0)\
#             .background_gradient(cmap='Blues', subset=df_stats.columns[6:10], axis=0)\
#             .background_gradient(cmap='Greens', subset=df_stats.columns[10:-1], axis=0)\
#            )

## Multiple Models

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## First Run

In [10]:
# Accumulates one cross_val result tuple per evaluated model configuration.
all_results = []

def on_1_completed(results):
    """Callback for grid_exec_callback: store one result and redraw the summary.

    Appends ``results`` to ``all_results``, clears the cell output and shows
    the refreshed statistics table for everything collected so far.
    """
    all_results.append(results)
    clear_output(wait=True)
    # The table is built once per callback; a second, discarded
    # get_stats_df call used to double the work here.
    display_stats(get_stats_df(all_results))

def grid_func(factory, n_folds, n_fold_repeats, n_model_repeats):
    """Adapter that binds the notebook-level X and y for grid execution of cross_val."""
    return cross_val(factory, X, y, n_folds, n_fold_repeats, n_model_repeats)

In [11]:
# Ensemble sizes tried for every ensemble model in the first run.
n_est_list = [50, 200, 400]

# Two default-parameter baselines followed by each ensemble at every size;
# CatBoost additionally gets 'silent': True.
all_models = [
    (DecisionTreeClassifier, {}),
    (KNeighborsClassifier, {}),
    *[
        (ctor, {'random_state': RS, 'n_estimators': n_e, **extra})
        for ctor, extra in [
            (RandomForestClassifier, {}),
            (AdaBoostClassifier, {}),
            (LGBMClassifier, {}),
            (XGBClassifier, {}),
            (CatBoostClassifier, {'silent': True}),
        ]
        for n_e in n_est_list
    ],
]

# Single-valued grids: 5 folds, 3 fold repetitions, 10 models per split.
n_folds_list, n_fold_repeats_list, n_model_repeats_list = [5], [3], [10]

_ = grid_exec_callback(
    grid_func, on_1_completed,
    all_models, n_folds_list, n_fold_repeats_list, n_model_repeats_list,
)

Unnamed: 0,model,params,n_folds,n_model_repeats,min,mean,max,std,folds_min,folds_mean,folds_max,folds_std,time
0,AdaBoostClassifier,'n_estimators': 400,15,10,0.945,0.989,1.0,0.013,0.949,0.989,1.0,0.013,97.7
1,CatBoostClassifier,'n_estimators': 400,15,10,0.955,0.988,1.0,0.012,0.962,0.989,1.0,0.012,312.1
2,CatBoostClassifier,'n_estimators': 200,15,10,0.955,0.988,1.0,0.013,0.96,0.988,1.0,0.013,162.9
3,LGBMClassifier,'n_estimators': 400,15,10,0.958,0.988,1.0,0.013,0.957,0.988,1.0,0.013,8.4
4,LGBMClassifier,'n_estimators': 200,15,10,0.96,0.988,1.0,0.012,0.96,0.988,1.0,0.012,5.9
5,CatBoostClassifier,'n_estimators': 50,15,10,0.955,0.988,1.0,0.013,0.964,0.988,1.0,0.012,41.3
6,RandomForestClassifier,'n_estimators': 400,15,10,0.95,0.987,1.0,0.014,0.957,0.987,1.0,0.014,90.2
7,AdaBoostClassifier,'n_estimators': 200,15,10,0.953,0.987,1.0,0.014,0.956,0.987,1.0,0.014,49.2
8,RandomForestClassifier,'n_estimators': 200,15,10,0.95,0.986,1.0,0.014,0.957,0.986,1.0,0.014,45.4
9,RandomForestClassifier,'n_estimators': 50,15,10,0.949,0.985,1.0,0.016,0.957,0.985,1.0,0.015,11.8


## Shortlist models

In [12]:
# Result store for the shortlist run.
all_results2 = []

def on_1_completed(results):
    """Record one shortlist result and refresh the displayed summary table."""
    all_results2.append(results)
    clear_output(wait=True)
    summary = get_stats_df(all_results2)
    display_stats(summary)

def grid_func(factory, n_folds, n_fold_repeats, n_model_repeats):
    """Run cross_val on the notebook-level X and y for one grid combination."""
    return cross_val(factory, X, y, n_folds, n_fold_repeats, n_model_repeats)

In [None]:
# Shortlisted ensembles, each paired with the n_estimators values to try.
all_models = [
    (ctor, {'random_state': RS, 'n_estimators': n_e})
    for ctor, n_est_options in [
        (RandomForestClassifier, [800]),
        (AdaBoostClassifier, [400, 800]),
        (LGBMClassifier, [400, 800]),
    ]
    for n_e in n_est_options
]

# Single-valued grids: 5 folds, 3 fold repetitions, 10 models per split.
n_folds_list, n_fold_repeats_list, n_model_repeats_list = [5], [3], [10]

_ = grid_exec_callback(
    grid_func, on_1_completed,
    all_models, n_folds_list, n_fold_repeats_list, n_model_repeats_list,
)

## Best models for Feature Selection

In [None]:
# Start from an empty model list so only the models fitted below are kept.
all_trained_models.clear()
all_results3 = []

def on_1_completed(results):
    """Record one feature-selection result and refresh the displayed summary table."""
    all_results3.append(results)
    clear_output(wait=True)
    summary = get_stats_df(all_results3)
    display_stats(summary)

def grid_func(factory, n_folds, n_fold_repeats, n_model_repeats):
    """Run cross_val on the notebook-level X and y for one grid combination."""
    return cross_val(factory, X, y, n_folds, n_fold_repeats, n_model_repeats)

In [None]:
# The two chosen models, both with 400 estimators.
all_models = [
    (ctor, {'random_state': RS, 'n_estimators': 400})
    for ctor in (AdaBoostClassifier, LGBMClassifier)
]

# One pass only: 5 folds, no fold repetition, one model per split.
n_folds_list, n_fold_repeats_list, n_model_repeats_list = [5], [1], [1]

_ = grid_exec_callback(
    grid_func, on_1_completed,
    all_models, n_folds_list, n_fold_repeats_list, n_model_repeats_list,
)

In [None]:
# Alias (not a copy) of the global list that cross_val appends every fitted
# model to; clearing all_trained_models later also empties this name.
# The trailing "[::5]" is a disabled stride-5 slice.
models_to_analyse = all_trained_models#[::5]
models_to_analyse

In [None]:
# all_f_imp = []

# n_folds = 5
# n_models = len(all_trained_models[::5])


# for i, m in enumerate(models_to_analyse):
#     if 'feature_names_in_' in m.__dict__.keys():
#         all_f_imp.append(
#             pd.Series(data = m.feature_importances_, index = m.feature_names_in_, name=f'{m.__class__.__name__[0]}_{i}')#.rank().astype(int)
#         )
#     else:
#         all_f_imp.append(
#             pd.Series(data = m.feature_importances_, index = m.feature_name_, name=f'{m.__class__.__name__[0]}_{i}')
#         )

# df_f_imp = pd.concat(all_f_imp, axis=1)
# df_f_imp


In [None]:


# fe = pd.concat(all_f_imp, axis=1)
# sfor s_col in range(1,3):
#     fe.insert(5, 'sum_1', None)
#     fe.insert(11, 'sum_2', None)
#     fe.insert(17, 'sum_3', None)
#     fe['sum_1'] = fe.iloc[:, :5].sum(axis=1)
#     fe['sum_2'] = fe.iloc[:, 5:10].sum(axis=1)
#     fe['sum_3'] = fe.iloc[:, 10:].sum(axis=1)


# fe.sort_values(by=['sum_1', 'sum_2', 'sum_3'], ascending=False)\
#     .round(2)\
#     .style\
#     .format('{:.2f}')\
#     .highlight_max(color='green')\
#     .highlight_min(color='blue')\
#     .highlight_max(color='red', subset=['sum_1', 'sum_2', 'sum_3'])

In [None]:
# fe_s = fe.sort_values(by=['sum_1', 'sum_2', 'sum_3'], ascending=False).round(2).style
# _ = fe_s.format('{:.2f}')
# for c in fe.columns:
#     _ = fe_s.background_gradient(cmap='Greens', subset=c, vmin=-5, gmap=fe[c].rank(), )
    
# for c in ['sum_1', 'sum_2', 'sum_3']:
#     _ = fe_s.background_gradient(cmap='Reds', subset=c, vmin=-50, gmap=fe[c].rank(), )
    
# fe_s
#     # .highlight_max(color='green')\
#     # .highlight_min(color='blue')\

#     # .highlight_max(color='red', subset=['sum_1', 'sum_2', 'sum_3'])