In [3]:
!pip install xgboost
!pip install lightgbm
!pip install catboost



In [1]:
%load_ext autoreload
%autoreload 1
%aimport transformations

%aimport transformations
%aimport f

transformations loaded, version: 0.1
framework loaded, version: 0.1


In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler, PowerTransformer

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

RS = 35577 # global random state seed
data_path = './data-raw/'

In [3]:
(X, y) = f.from_pkl(f'data-processed/all_b1_b2.data.pkl')

X.shape
X.columns

y.shape
y.value_counts()

(200, 36)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__q1', 'x__q3', 'x__iqr', 'x__kurtosis', 'x__skew',
       'y__amin', 'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std',
       'y__var', 'y__q1', 'y__q3', 'y__iqr', 'y__kurtosis', 'y__skew',
       'z__amin', 'z__amax', 'z__sum', 'z__median', 'z__mean', 'z__std',
       'z__var', 'z__q1', 'z__q3', 'z__iqr', 'z__kurtosis', 'z__skew'],
      dtype='object')

(200,)

1    100
0    100
Name: status, dtype: int64

# CV

In [4]:
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score

In [5]:
import itertools

def grid_exec(func, *args):
    return [func(*element) for element in itertools.product(*args)]

def grid_exec_callback(func, callback, *args):
    return [ callback(func(*element)) for element in itertools.product(*args)]

In [6]:
all_trained_models = []

def cross_val(model_factory, X, y, cv_repeat_n = 1, model_repeat_n = 1, folds_n = 7):
    np.random.seed(RS)
    get_random = lambda  : np.random.randint(1, 10000)
    cv_scores = []
    
    for i_cv in range(cv_repeat_n):
        fold_scores = []
        cv_scores.append(fold_scores)
        
        for i_fold, (idx_train, idx_test) in enumerate(StratifiedKFold(n_splits=folds_n, shuffle=True, random_state=get_random()).split(X, y, y)):
            
            X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
            X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]
            
            model_scores = []
            fold_scores.append(model_scores)

            for i_model in range(model_repeat_n):
                rand = get_random()
                constructor, params_dic = model_factory(rand)
                
                model = constructor(**params_dic)
                # model = model_factory(get_random())
                model.fit(X_train, y_train)
                
                all_trained_models.append(model)

                y_pred = model.predict_proba(X_test)[:, 1]
                score = roc_auc_score(y_test, y_pred)
                model_scores.append(score)
                
            model_name = model.__class__.__name__
            model_params = params_dic.copy()
                
    return model_name, model_params, cv_repeat_n, folds_n, model_repeat_n, np.array(cv_scores), model

In [7]:
def get_stats_df(cv_results):
    
    ret_list = []
    
    for result in cv_results:
        
        model_params = result[1]
        model_params.pop('random_state', None)
        stats_ = result[5]
        
        ret_list.append(
            {
                'model' : result[0],
                # '#' : None,
                '##' : None,
                'params' : str(model_params).strip('{').strip('}'),
                'cv_repeat_n' : result[2],
                'folds_n' : result[3],    
                'model_repeat_n' : result[4],

                'min': stats_.min(),
                'mean': stats_.mean(),
                'max': stats_.max(),
                'std': stats_.std(),

                'folds_mean_min': stats_.mean(axis=2).reshape(1,-1).min(),
                # 'folds_mean_mean': stats_.mean(axis=2).reshape(1,-1).mean(),
                'folds_mean_max': stats_.mean(axis=2).reshape(1,-1).max(),
                'folds_mean_std': stats_.mean(axis=2).reshape(1,-1).std(),
            })
    df = pd.DataFrame(ret_list)
    
    # df['#'] = df.groupby('model', group_keys=False).apply(
    #             lambda df: (df['mean'].rank(ascending = False) \
    #                       + df['min'].rank(ascending = False) \
    #                       + df['folds_mean_std'].rank(ascending = True)
    #                        ).rank()).astype(int)
    
    df['##'] = (df['mean'].rank(ascending = False) \
                      + df['min'].rank(ascending = False) \
                      + df['folds_mean_std'].rank(ascending = True)
                       ).rank().astype(int)
    
    return df

def display_stats(df_stats):
    display(df_stats.style.format('{:,.3f}', df_stats.columns[6:])\
        # .background_gradient(cmap='Oranges', subset='#', axis=0)\
        .background_gradient(cmap='Reds', subset='##', axis=0)\
        .background_gradient(cmap='Blues', subset=df_stats.columns[6:10], axis=0)\
        .background_gradient(cmap='Greens', subset=df_stats.columns[10:], axis=0))

## Multiple Models

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

## First Run

In [None]:
KNeighborsClassifier

In [None]:
all_models = [
    lambda random_state: (DecisionTreeClassifier, {}),
    lambda random_state: (KNeighborsClassifier,   {}),

    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 600}),
    
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 500}),
    
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 500}),
    
    lambda random_state: (XGBClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (XGBClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (XGBClassifier, {'random_state':random_state, 'n_estimators': 500}),
    
    lambda random_state: (CatBoostClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (CatBoostClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (CatBoostClassifier, {'random_state':random_state, 'n_estimators': 500}),
]

grid_func = lambda factory, cv_repeat_n, model_repeat_n, folds_n: cross_val(factory, X, y, cv_repeat_n, model_repeat_n, folds_n)

all_trained_models.clear()
all_results = []
def on_1_completed(results):
    all_results.append(results)
    clear_output(wait=True)
    get_stats_df(all_results)
    display_stats(get_stats_df(all_results))
    
cv_repeat_list = [3]
model_repeat_list = [20]
folds_list = [5]

_ = grid_exec_callback(grid_func, on_1_completed, all_models, cv_repeat_list, model_repeat_list, folds_list)

Unnamed: 0,model,##,params,cv_repeat_n,folds_n,model_repeat_n,min,mean,max,std,folds_mean_min,folds_mean_max,folds_mean_std
0,DecisionTreeClassifier,15,,3,5,20,0.725,0.855,0.975,0.057,0.729,0.934,0.053
1,KNeighborsClassifier,14,,3,5,20,0.869,0.939,0.995,0.032,0.869,0.995,0.032
2,RandomForestClassifier,10,'n_estimators': 50,3,5,20,0.937,0.984,1.0,0.016,0.959,1.0,0.015
3,RandomForestClassifier,7,'n_estimators': 200,3,5,20,0.95,0.986,1.0,0.015,0.964,1.0,0.014
4,RandomForestClassifier,6,'n_estimators': 600,3,5,20,0.96,0.986,1.0,0.014,0.965,1.0,0.014
5,AdaBoostClassifier,9,'n_estimators': 50,3,5,20,0.943,0.983,1.0,0.018,0.942,1.0,0.018
6,AdaBoostClassifier,4,'n_estimators': 200,3,5,20,0.963,0.989,1.0,0.012,0.962,1.0,0.012
7,AdaBoostClassifier,1,'n_estimators': 500,3,5,20,0.973,0.991,1.0,0.009,0.972,1.0,0.009
8,LGBMClassifier,8,'n_estimators': 50,3,5,20,0.948,0.983,1.0,0.017,0.948,1.0,0.017
9,LGBMClassifier,3,'n_estimators': 200,3,5,20,0.965,0.989,1.0,0.012,0.965,1.0,0.012


Learning rate set to 0.020609
0:	learn: 0.6750262	total: 2.64ms	remaining: 525ms
1:	learn: 0.6546799	total: 4.37ms	remaining: 432ms
2:	learn: 0.6332955	total: 8.12ms	remaining: 534ms
3:	learn: 0.6108807	total: 9.91ms	remaining: 486ms
4:	learn: 0.5927722	total: 11.4ms	remaining: 446ms
5:	learn: 0.5730431	total: 13.5ms	remaining: 437ms
6:	learn: 0.5601833	total: 15ms	remaining: 414ms
7:	learn: 0.5445139	total: 17.3ms	remaining: 416ms
8:	learn: 0.5262256	total: 19.2ms	remaining: 408ms
9:	learn: 0.5093603	total: 21.2ms	remaining: 403ms
10:	learn: 0.4963978	total: 22.7ms	remaining: 391ms
11:	learn: 0.4813488	total: 25.2ms	remaining: 396ms
12:	learn: 0.4657825	total: 27.4ms	remaining: 394ms
13:	learn: 0.4514323	total: 30.1ms	remaining: 399ms
14:	learn: 0.4369101	total: 32ms	remaining: 395ms
15:	learn: 0.4223847	total: 33.6ms	remaining: 386ms
16:	learn: 0.4137528	total: 35.2ms	remaining: 379ms
17:	learn: 0.3998520	total: 36.7ms	remaining: 371ms
18:	learn: 0.3889842	total: 38.2ms	remaining: 36

## Shortlist models

## Shortlist based on [model_repeat_n]

In [19]:
all_models = [
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 400}),
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 800}),
    
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 400}),
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 800}),
    
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 100}),
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 300}),
]

grid_func = lambda factory, cv_repeat_n, model_repeat_n, folds_n: cross_val(factory, X, y, cv_repeat_n, model_repeat_n, folds_n)

all_trained_models.clear()
all_results = []
def on_1_completed(results):
    all_results.append(results)
    clear_output(wait=True)
    get_stats_df(all_results)
    display_stats(get_stats_df(all_results))
    
cv_repeat_list = [2]
model_repeat_list = [1, 5, 15, 30]
folds_list = [5]

_ = grid_exec_callback(grid_func, on_1_completed, all_models, cv_repeat_list, model_repeat_list, folds_list)

Unnamed: 0,model,##,params,cv_repeat_n,folds_n,model_repeat_n,min,mean,max,std,folds_mean_min,folds_mean_max,folds_mean_std
0,RandomForestClassifier,12,'n_estimators': 200,2,5,1,0.968,0.987,1.0,0.013,0.968,1.0,0.013
1,RandomForestClassifier,19,'n_estimators': 200,2,5,5,0.96,0.988,1.0,0.014,0.965,1.0,0.014
2,RandomForestClassifier,33,'n_estimators': 200,2,5,15,0.932,0.984,1.0,0.019,0.949,1.0,0.019
3,RandomForestClassifier,31,'n_estimators': 200,2,5,30,0.941,0.983,1.0,0.016,0.953,1.0,0.016
4,RandomForestClassifier,14,'n_estimators': 400,2,5,1,0.968,0.986,1.0,0.014,0.968,1.0,0.014
5,RandomForestClassifier,21,'n_estimators': 400,2,5,5,0.959,0.988,1.0,0.015,0.963,1.0,0.014
6,RandomForestClassifier,32,'n_estimators': 400,2,5,15,0.935,0.985,1.0,0.019,0.948,1.0,0.019
7,RandomForestClassifier,30,'n_estimators': 400,2,5,30,0.943,0.983,1.0,0.016,0.953,1.0,0.016
8,RandomForestClassifier,17,'n_estimators': 800,2,5,1,0.965,0.987,1.0,0.014,0.965,1.0,0.014
9,RandomForestClassifier,24,'n_estimators': 800,2,5,5,0.958,0.988,1.0,0.015,0.963,1.0,0.015


## Shortlist based on [model_repeat_n] v2

In [22]:
all_models = [
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 400}),
    
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 400}),
    
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 200}),
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 300}),
]

grid_func = lambda factory, cv_repeat_n, model_repeat_n, folds_n: cross_val(factory, X, y, cv_repeat_n, model_repeat_n, folds_n)

all_trained_models.clear()
all_results = []
def on_1_completed(results):
    all_results.append(results)
    clear_output(wait=True)
    get_stats_df(all_results)
    display_stats(get_stats_df(all_results))
    
cv_repeat_list = [3]
model_repeat_list = [30, 60, 90]
folds_list = [5]

_ = grid_exec_callback(grid_func, on_1_completed, all_models, cv_repeat_list, model_repeat_list, folds_list)

Unnamed: 0,model,##,params,cv_repeat_n,folds_n,model_repeat_n,min,mean,max,std,folds_mean_min,folds_mean_max,folds_mean_std
0,RandomForestClassifier,11,'n_estimators': 200,3,5,30,0.941,0.983,1.0,0.014,0.953,1.0,0.014
1,RandomForestClassifier,16,'n_estimators': 200,3,5,60,0.936,0.984,1.0,0.018,0.95,1.0,0.017
2,RandomForestClassifier,7,'n_estimators': 200,3,5,90,0.943,0.985,1.0,0.014,0.953,1.0,0.014
3,RandomForestClassifier,12,'n_estimators': 400,3,5,30,0.943,0.983,1.0,0.014,0.953,1.0,0.014
4,RandomForestClassifier,15,'n_estimators': 400,3,5,60,0.94,0.984,1.0,0.017,0.95,1.0,0.017
5,RandomForestClassifier,6,'n_estimators': 400,3,5,90,0.948,0.986,1.0,0.014,0.954,1.0,0.014
6,AdaBoostClassifier,9,'n_estimators': 200,3,5,30,0.94,0.987,1.0,0.017,0.94,1.0,0.017
7,AdaBoostClassifier,10,'n_estimators': 200,3,5,60,0.945,0.986,1.0,0.018,0.945,1.0,0.018
8,AdaBoostClassifier,5,'n_estimators': 200,3,5,90,0.955,0.987,1.0,0.016,0.955,1.0,0.016
9,AdaBoostClassifier,1,'n_estimators': 400,3,5,30,0.958,0.99,1.0,0.012,0.957,1.0,0.012


## Shortlist based on [cv_repeat_n]

## Shortlist based on [folds_n]

## Shortlist

In [None]:
all_models = [
    # lambda random_state: (DecisionTreeClassifier, {}),
    # lambda random_state: (KNeighborsClassifier,   {}),

    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 20}),
    # lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 600}),
    
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 20}),
    # lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 600}),
    
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 50}),
    lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 20}),
    # lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 600}),
]
    # lambda random_state: (XGBClassifier,          {'random_state':random_state}),

grid_func = lambda factory, cv_repeat_n, model_repeat_n, folds_n: cross_val(factory, X, y, cv_repeat_n, model_repeat_n, folds_n)

all_trained_models.clear()
all_results = []
def on_1_completed(results):
    all_results.append(results)
    clear_output(wait=True)
    get_stats_df(all_results)
    display_stats(get_stats_df(all_results))

In [None]:
cv_repeat_list = [1]
model_repeat_list = [1]
folds_list = [3]

_ = grid_exec_callback(grid_func, on_1_completed, all_models, cv_repeat_list, model_repeat_list, folds_list)

In [None]:
all_trained_models

In [None]:
m = all_trained_models[0]
pd.Series(data = m.feature_importances_, index = m.feature_names_in_).sort_values(ascending=False)

m = all_trained_models[6]
pd.Series(data = m.feature_importances_, index = m.feature_names_in_).sort_values(ascending=False)

m = all_trained_models[12]
pd.Series(data = m.feature_importances_, index = m.feature_name_).sort_values(ascending=False)

In [None]:
all_fe = []

for m in all_trained_models:
    if 'feature_names_in_' in m.__dict__.keys():
        all_fe.append(
            pd.Series(data = m.feature_importances_, index = m.feature_names_in_).rank().astype(int)
        )
    else:
        all_fe.append(
            pd.Series(data = m.feature_importances_, index = m.feature_name_).rank().astype(int)
        )

fe = pd.concat(all_fe, axis=1).sort_values(by=[0, 1, 2], ascending=False).head(15)
fe

In [None]:
all_fe = []

for i, m in enumerate(all_trained_models):
    if 'feature_names_in_' in m.__dict__.keys():
        all_fe.append(
            pd.Series(data = m.feature_importances_, index = m.feature_names_in_, name=f'{m.__class__.__name__[0]}_{i}')#.rank().astype(int)
        )
    else:
        all_fe.append(
            pd.Series(data = m.feature_importances_, index = m.feature_name_, name=f'{m.__class__.__name__[0]}_{i}')
        )

fe = pd.concat(all_fe, axis=1)
fe['sum_1'] = fe.iloc[:, :6].sum(axis=1)
fe['sum_2'] = fe.iloc[:, 6:12].sum(axis=1)
fe['sum_3'] = fe.iloc[:, 12:].sum(axis=1)
fe.sort_values(by=['sum_1', 'sum_2', 'sum_3'], ascending=False)\
    .round(2)\
    .style\
    .format('{:,.2f}', subset=fe.columns[:12])\
    .format('{:,.2f}', subset=fe.columns[-3:])\
    .highlight_max(color='green')\
    .highlight_min(color='blue')\
    .highlight_max(color='red', subset=['sum_1', 'sum_2', 'sum_3'])