In [1]:
!pip install xgboost
!pip install lightgbm



In [2]:
# Enable autoreload in mode 1: only modules registered with %aimport are
# reloaded before a cell executes.
%load_ext autoreload
%autoreload 1
%aimport transformations

# NOTE(review): the next line repeats the %aimport above and is redundant.
%aimport transformations
%aimport f

transformations loaded, version: 0.1
framework loaded, version: 0.1


In [3]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from IPython.display import clear_output

import numpy as np
import pandas as pd
# pandas display limits: all columns, long cell contents, up to 113 rows.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler, PowerTransformer

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

RS = 35577 # global random state seed
# NOTE(review): data_path is not referenced by any cell visible in this section.
data_path = './data-raw/'

In [56]:
# Load the preprocessed feature matrix and its target labels.
X, y = f.from_pkl('data-processed/all_b1_b2.data.pkl')

# Features: dimensions, then column names.
X.shape
X.columns

# Target: length, then class balance.
y.shape
y.value_counts()

(200, 36)

Index(['x__amin', 'x__amax', 'x__sum', 'x__median', 'x__mean', 'x__std',
       'x__var', 'x__q1', 'x__q3', 'x__iqr', 'x__kurtosis', 'x__skew',
       'y__amin', 'y__amax', 'y__sum', 'y__median', 'y__mean', 'y__std',
       'y__var', 'y__q1', 'y__q3', 'y__iqr', 'y__kurtosis', 'y__skew',
       'z__amin', 'z__amax', 'z__sum', 'z__median', 'z__mean', 'z__std',
       'z__var', 'z__q1', 'z__q3', 'z__iqr', 'z__kurtosis', 'z__skew'],
      dtype='object')

(200,)

1    100
0    100
Name: status, dtype: int64

# Cross-validation (CV)

In [57]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [None]:
import itertools

def grid_exec(func, *args):
    """Call `func` once for every combination of the given iterables.

    Each positional argument in `args` is one grid axis; `func` receives one
    value per axis, in axis order. Results are returned as a list in
    `itertools.product` order (the last axis varies fastest).
    """
    combinations = itertools.product(*args)
    return list(itertools.starmap(func, combinations))

def grid_exec_callback(func, callback, *args):
    """Run `func` over the grid and pass each result to `callback` immediately.

    Each positional argument in `args` is one grid axis. For every combination
    (in `itertools.product` order) `func` is called and its result is handed to
    `callback` before the next combination starts. The list of callback return
    values is returned.
    """
    collected = []
    for combination in itertools.product(*args):
        outcome = func(*combination)
        collected.append(callback(outcome))
    return collected

In [58]:
def cross_val(model_factory, X, y, cv_repeat_n = 1, model_repeat_n = 1, folds_n = 7):
    """Repeated stratified k-fold cross-validation scored with ROC AUC.

    For each of `cv_repeat_n` repetitions the data is split into `folds_n`
    shuffled stratified folds; on every fold `model_repeat_n` models are built,
    fitted on the training part and scored on the held-out part using the
    predicted probability in column 1 of `predict_proba`.

    Parameters
    ----------
    model_factory : callable
        Takes an integer seed and returns `(constructor, params_dict)`; the
        model is created as `constructor(**params_dict)`.
    X, y : objects supporting `.iloc` row selection
        Features and labels.
    cv_repeat_n : int, >= 1
        Number of times the whole k-fold split is repeated with a new seed.
    model_repeat_n : int, >= 1
        Number of models fitted per fold, each with a new seed.
    folds_n : int
        Number of folds passed to `StratifiedKFold`.

    Returns
    -------
    tuple
        `(model_name, model_params, cv_repeat_n, folds_n, model_repeat_n, scores)`
        where `model_name` is the class name of the last fitted model,
        `model_params` is a copy of the last params dict produced by the
        factory, and `scores` is an array indexed as
        `[cv repetition, fold, model repetition]`.

    Raises
    ------
    ValueError
        If `cv_repeat_n` or `model_repeat_n` is smaller than 1.
    """
    if cv_repeat_n < 1 or model_repeat_n < 1:
        raise ValueError(
            f'cv_repeat_n and model_repeat_n must be >= 1 '
            f'(got {cv_repeat_n} and {model_repeat_n})')

    # Seed the global NumPy generator so each call draws the same sequence of
    # split/model seeds; estimators built without an explicit random_state
    # also fall back to this global generator.
    np.random.seed(RS)
    get_random = lambda  : np.random.randint(1, 10000)
    cv_scores = []

    for _ in range(cv_repeat_n):
        fold_scores = []
        cv_scores.append(fold_scores)

        # The split seed is drawn before any model seed of this repetition.
        splitter = StratifiedKFold(n_splits=folds_n, shuffle=True, random_state=get_random())

        # StratifiedKFold stratifies on y and ignores `groups`, so none is passed.
        for idx_train, idx_test in splitter.split(X, y):

            X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
            X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

            model_scores = []
            fold_scores.append(model_scores)

            for _ in range(model_repeat_n):
                rand = get_random()
                constructor, params_dic = model_factory(rand)

                model = constructor(**params_dic)
                model.fit(X_train, y_train)

                y_pred = model.predict_proba(X_test)[:, 1]
                score = roc_auc_score(y_test, y_pred)
                model_scores.append(score)

    # Both repeat counts are >= 1, so `model` and `params_dic` are bound here
    # and refer to the last model that was fitted.
    model_name = model.__class__.__name__
    model_params = params_dic.copy()

    return model_name, model_params, cv_repeat_n, folds_n, model_repeat_n, np.array(cv_scores)

In [100]:
def get_stats_df(cv_results):
    """Summarise cross-validation results as one DataFrame row per result.

    Parameters
    ----------
    cv_results : iterable of tuples
        Each item is laid out as
        `(model_name, params_dict, cv_repeat_n, folds_n, model_repeat_n, scores)`
        with `scores` a 3-D array indexed `[cv repetition, fold, model repetition]`.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: `model`, `#`, `##`, `params`, `cv_repeat_n`,
        `folds_n`, `model_repeat_n`, `min`, `mean`, `max`, `std`,
        `folds_mean_min`, `folds_mean_max`, `folds_mean_std`.
        `params` is the params dict rendered without its `random_state` entry.
        `##` ranks all rows by the sum of three ranks (higher `mean`, higher
        `min`, lower `folds_mean_std`; 1 is best); `#` is the same ranking
        computed only among rows of the same `model`. Fractional (tied) ranks
        are truncated to int. An empty input yields an empty frame with the
        same columns.
    """
    columns = [
        'model', '#', '##', 'params', 'cv_repeat_n', 'folds_n', 'model_repeat_n',
        'min', 'mean', 'max', 'std',
        'folds_mean_min', 'folds_mean_max', 'folds_mean_std',
    ]

    ret_list = []

    for result in cv_results:

        # Filter into a new dict instead of popping, so the caller's params
        # dict keeps its 'random_state' entry.
        model_params = {k: v for k, v in result[1].items() if k != 'random_state'}
        stats_ = np.asarray(result[-1])
        # Average over the model-repetition axis: one value per (cv repetition, fold).
        folds_mean = stats_.mean(axis=2)

        ret_list.append(
            {
                'model' : result[0],
                '#' : None,
                '##' : None,
                # Same text as str(dict) without the outer braces, but safe
                # when a value's own repr starts or ends with a brace.
                'params' : ', '.join(f'{k!r}: {v!r}' for k, v in model_params.items()),
                'cv_repeat_n' : result[2],
                'folds_n' : result[3],
                'model_repeat_n' : result[4],

                'min': stats_.min(),
                'mean': stats_.mean(),
                'max': stats_.max(),
                'std': stats_.std(),

                'folds_mean_min': folds_mean.min(),
                'folds_mean_max': folds_mean.max(),
                'folds_mean_std': folds_mean.std(),
            })
    df = pd.DataFrame(ret_list, columns=columns)

    if df.empty:
        return df

    def _rank_sum(source):
        # `source` is the frame itself or a groupby over it; in both cases
        # selecting a column and calling .rank() gives a row-aligned Series.
        return (source['mean'].rank(ascending = False)
                + source['min'].rank(ascending = False)
                + source['folds_mean_std'].rank(ascending = True))

    # groupby(...).rank() always returns a Series aligned with the rows, unlike
    # groupby(...).apply(...), whose result collapses to a wide frame when
    # only one model type is present.
    df['#'] = _rank_sum(df.groupby('model')).groupby(df['model']).rank().astype(int)
    df['##'] = _rank_sum(df).rank().astype(int)

    return df

def display_stats(df_stats):
    """Render a stats table with number formatting and colour gradients.

    Columns from position 7 onward are formatted with three decimals. The
    '#' and '##' columns get their own gradients, columns 7-10 are shaded
    blue and columns 11 onward green, each gradient computed per column.
    """
    all_columns = df_stats.columns
    styled = df_stats.style.format('{:,.3f}', all_columns[7:])

    gradients = (
        ('Oranges', '#'),
        ('Reds', '##'),
        ('Blues', all_columns[7:11]),
        ('Greens', all_columns[11:]),
    )
    for cmap_name, target in gradients:
        styled = styled.background_gradient(cmap=cmap_name, subset=target, axis=0)

    display(styled)

In [102]:
# NOTE: `all_results` is created and filled by the model-grid cells further
# down in this notebook; on a fresh kernel this cell must be run after them.
display_stats(get_stats_df(all_results))

Unnamed: 0,model,#,##,params,cv_repeat_n,folds_n,model_repeat_n,min,mean,max,std,folds_mean_min,folds_mean_max,folds_mean_std
0,RandomForestClassifier,1,1,'n_estimators': 50,5,5,10,0.932,0.985,1.0,0.016,0.947,1.0,0.015
1,RandomForestClassifier,2,3,'n_estimators': 20,5,5,10,0.916,0.982,1.0,0.018,0.934,1.0,0.017
2,AdaBoostClassifier,1,4,'n_estimators': 50,5,5,10,0.942,0.981,1.0,0.02,0.942,1.0,0.02
3,AdaBoostClassifier,2,6,'n_estimators': 20,5,5,10,0.825,0.966,1.0,0.04,0.825,1.0,0.04
4,LGBMClassifier,1,2,'n_estimators': 50,5,5,10,0.943,0.984,1.0,0.018,0.943,1.0,0.018
5,LGBMClassifier,2,5,'n_estimators': 20,5,5,10,0.925,0.978,1.0,0.02,0.925,1.0,0.02


## Multiple Models

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [63]:
def _make_factory(estimator_cls, n_estimators):
    """Return a `random_state -> (constructor, params)` factory for cross_val."""
    def factory(random_state):
        return estimator_cls, {'random_state': random_state, 'n_estimators': n_estimators}
    return factory

# Candidate configurations in evaluation order: each estimator class with
# 50 and then 20 estimators.
all_models = [
    _make_factory(estimator_cls, n_estimators)
    for estimator_cls in (RandomForestClassifier, AdaBoostClassifier, LGBMClassifier)
    for n_estimators in (50, 20)
]

# Disabled candidates, kept for reference:
    # lambda random_state: (DecisionTreeClassifier, {}),
    # lambda random_state: (KNeighborsClassifier,   {}),
    # lambda random_state: (RandomForestClassifier, {'random_state':random_state, 'n_estimators': 600}),
    # lambda random_state: (AdaBoostClassifier, {'random_state':random_state, 'n_estimators': 600}),
    # lambda random_state: (LGBMClassifier, {'random_state':random_state, 'n_estimators': 600}),
    # lambda random_state: (XGBClassifier,          {'random_state':random_state}),

def grid_func(factory, cv_repeat_n, model_repeat_n, folds_n):
    """Cross-validate one factory on the notebook-level X and y."""
    return cross_val(factory, X, y, cv_repeat_n, model_repeat_n, folds_n)

# Accumulates one cross_val result tuple per finished configuration.
all_results = []
def on_1_completed(results):
    """Record one finished cross-validation result and refresh the summary.

    Appends `results` to the notebook-level `all_results`, clears the cell
    output and displays the stats table rebuilt from everything collected so
    far. Returns None.
    """
    all_results.append(results)
    clear_output(wait=True)
    # Build the summary once; a second call whose result is discarded only
    # repeats the same work.
    display_stats(get_stats_df(all_results))

In [64]:
# Grid axes: every factory in all_models is combined with each value below.
cv_repeat_list = [5]
model_repeat_list = [10]
folds_list = [5]

# Run the grid. on_1_completed refreshes the displayed summary after each
# configuration finishes, so the list of callback return values is discarded.
# NOTE: results are appended to all_results; re-running this cell without
# first re-running the cell that resets all_results adds duplicate rows.
_ = grid_exec_callback(grid_func, on_1_completed, all_models, cv_repeat_list, model_repeat_list, folds_list)

Unnamed: 0,model,params,cv_repeat_n,folds_n,model_repeat_n,min,mean,max,std,folds_mean_min,folds_mean_max,folds_mean_std
0,RandomForestClassifier,'n_estimators': 50,5,5,10,0.93,0.98,1.0,0.02,0.95,1.0,0.02
1,RandomForestClassifier,'n_estimators': 20,5,5,10,0.92,0.98,1.0,0.02,0.93,1.0,0.02
2,AdaBoostClassifier,'n_estimators': 50,5,5,10,0.94,0.98,1.0,0.02,0.94,1.0,0.02
3,AdaBoostClassifier,'n_estimators': 20,5,5,10,0.82,0.97,1.0,0.04,0.82,1.0,0.04
4,LGBMClassifier,'n_estimators': 50,5,5,10,0.94,0.98,1.0,0.02,0.94,1.0,0.02
5,LGBMClassifier,'n_estimators': 20,5,5,10,0.93,0.98,1.0,0.02,0.93,1.0,0.02
