In [1]:
# Enable IPython's autoreload extension. Mode 1 re-imports, before each cell
# execution, only the modules that were registered with %aimport.
%load_ext autoreload
%autoreload 1
# Register the project module so edits to it are picked up without a restart.
%aimport transformations

transformations loaded, version: 0.1


In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; cells further down list several bare expressions and rely on this.
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd
# Widen pandas' display limits: no cap on the number of columns, long cell
# contents up to 999 characters, and at most 113 rows per rendered frame.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 999
pd.options.display.max_rows = 113

import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.preprocessing import StandardScaler, PowerTransformer

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture

RS = 35577 # global random state seed
# NOTE(review): data_path is not referenced by any cell visible in this part of
# the notebook — confirm it is still needed.
data_path = './data-raw/'

# Register the project modules with autoreload so that they are re-imported
# when their source changes.
%aimport transformations
%aimport f

framework loaded, version: 0.1


In [3]:
# Load the preprocessed feature frame X and the target series y.
# The path is a plain string: it has no placeholders, so no f-string is needed.
# NOTE(review): `f.from_pkl` is a project helper; if it unpickles the file,
# only point it at trusted files (unpickling can execute arbitrary code).
(X, y) = f.from_pkl('data-processed/all_b1_b2.data.pkl')

# Sanity check: dimensions and first rows of the features ...
X.shape
X.head(3)

# ... and of the target.
y.shape
y.head(3)

(200, 36)

Unnamed: 0,x__amin,x__amax,x__sum,x__median,x__mean,x__std,x__var,x__q1,x__q3,x__iqr,x__kurtosis,x__skew,y__amin,y__amax,y__sum,y__median,y__mean,y__std,y__var,y__q1,y__q3,y__iqr,y__kurtosis,y__skew,z__amin,z__amax,z__sum,z__median,z__mean,z__std,z__var,z__q1,z__q3,z__iqr,z__kurtosis,z__skew
0,-6.716668,5.307868,-10988.070315,-0.079124,-0.134299,1.359987,1.849565,-0.944891,0.690446,1.635337,0.64558,-0.260035,-5.812267,5.638507,-2005.74944,-0.039563,-0.024515,1.090317,1.188792,-0.702004,0.622879,1.324883,0.845721,0.190106,-8.080924,8.244501,-15909.421691,-0.206307,-0.194449,1.503315,2.259956,-1.070594,0.754012,1.824606,0.873858,-0.226021
1,-12.103661,11.75302,-18844.323785,-0.17532,-0.239902,2.477716,6.139077,-1.522068,1.075231,2.5973,1.408079,-0.167536,-8.745936,12.452191,-318.869239,-0.134197,-0.004059,2.37612,5.645946,-1.45908,0.906782,2.365862,1.936212,0.887947,-10.193626,10.357203,-6543.554226,-0.110275,-0.083304,2.156482,4.650413,-1.262658,1.13814,2.400798,0.657898,0.065098
2,-7.678631,6.55842,-12546.141906,-0.079124,-0.136898,1.350739,1.824495,-0.752498,0.498054,1.250552,1.485456,-0.097227,-4.298115,5.259969,5780.428883,0.055072,0.063073,0.822131,0.6759,-0.323466,0.43361,0.757076,2.150199,0.258396,-10.289657,8.244501,12884.241287,0.177821,0.140587,1.480194,2.190974,-0.398371,0.850044,1.248415,3.049378,-0.710093


(200,)

0    1
1    1
2    1
Name: status, dtype: int64

# Models

In [24]:
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

In [25]:
# Model factories. Each one takes a seed and returns a fresh, unfitted
# classifier, so that every fit can be given its own random state.
def _make_decision_tree(random_state):
    return DecisionTreeClassifier(random_state=random_state)


def _make_random_forest(random_state):
    return RandomForestClassifier(random_state=random_state)


def _make_ada_boost(random_state):
    return AdaBoostClassifier(random_state=random_state)


def _make_k_neighbors(random_state):
    # The seed is accepted to keep a uniform call signature but is not
    # forwarded to the estimator.
    return KNeighborsClassifier()


def _make_lgbm(random_state):
    return LGBMClassifier(random_state=random_state)


all_models = [
    # Currently disabled:
    # lambda random_state: LogisticRegression(random_state=random_state),
    _make_decision_tree,
    _make_random_forest,
    _make_ada_boost,
    _make_k_neighbors,
    _make_lgbm,
]

# CV

In [26]:
def cross_val(model_factory, X, y, cv_repeat_n = 1, model_repeat_n = 1, folds_n = 7):
    """Run repeated stratified K-fold cross-validation and collect ROC AUC scores.

    Parameters
    ----------
    model_factory : callable
        Called with one integer seed; must return an unfitted classifier that
        provides ``fit`` and ``predict_proba``.
    X, y
        Features and target; both are sliced positionally with ``.iloc``.
    cv_repeat_n : int
        How many times the K-fold split is redrawn with a new shuffle seed.
    model_repeat_n : int
        How many differently seeded models are fitted and scored on each fold.
    folds_n : int
        Number of stratified folds per split.

    Returns
    -------
    tuple
        ``(model_name, cv_repeat_n, folds_n, model_repeat_n, scores)`` where
        ``model_name`` is the class name of the object the factory builds and
        ``scores`` is ``np.array`` of the nested score lists, indexed as
        ``[cv repeat][fold][model repeat]``.
    """
    # Use a private generator seeded from the notebook-wide RS instead of
    # reseeding NumPy's global RNG: the seed sequence is the same, but the
    # call no longer clobbers global random state, and estimators that draw
    # from the global RNG cannot shift the seeds handed out below.
    rng = np.random.RandomState(RS)

    def next_seed():
        return rng.randint(1, 10000)

    cv_scores = []

    for _ in range(cv_repeat_n):
        # One fresh shuffle seed per repetition of the whole K-fold split.
        splitter = StratifiedKFold(n_splits=folds_n, shuffle=True, random_state=next_seed())
        fold_scores = []

        for idx_train, idx_test in splitter.split(X, y):
            X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
            X_test, y_test = X.iloc[idx_test], y.iloc[idx_test]

            model_scores = []
            for _ in range(model_repeat_n):
                model = model_factory(next_seed())
                model.fit(X_train, y_train)

                # Column 1 of predict_proba is used as the score for ROC AUC.
                y_pred = model.predict_proba(X_test)[:, 1]
                model_scores.append(roc_auc_score(y_test, y_pred))

            fold_scores.append(model_scores)

        cv_scores.append(fold_scores)

    # Build a throwaway instance only to read the estimator's class name.
    model_name = model_factory(0).__class__.__name__
    return model_name, cv_repeat_n, folds_n, model_repeat_n, np.array(cv_scores)

In [27]:
def get_stats(results):
    """Summarise one cross-validation result tuple as a single-row DataFrame.

    Parameters
    ----------
    results : sequence
        ``results[0]`` is used as the row label, ``results[1]``, ``results[2]``
        and ``results[3]`` are stored as ``cv_repeat_n``, ``folds_n`` and
        ``model_repeat_n``, and ``results[4]`` is an array of scores with at
        least three axes.

    Returns
    -------
    pandas.DataFrame
        One row holding the three settings, min/mean/max/std over all scores,
        and the same four statistics over the scores averaged along axis 2
        (the ``fold_*`` columns).
    """
    row_label = results[0]
    scores = results[4]

    # Average along axis 2 once, flattened to a single row, and reuse it for
    # all four fold-level statistics.
    per_fold = scores.mean(axis=2).reshape(1, -1)

    summary = {
        'folds_n': results[2],
        'cv_repeat_n': results[1],
        'model_repeat_n': results[3],

        'min': scores.min(),
        'mean': scores.mean(),
        'max': scores.max(),
        'std': scores.std(),

        'fold_min': per_fold.min(),
        'fold_mean': per_fold.mean(),
        'fold_max': per_fold.max(),
        'fold_std': per_fold.std(),
    }
    return pd.DataFrame(summary, index=[row_label])

## Multiple Models

In [28]:
%%time 

# Collects one single-row stats frame per (grid setting, model) combination.
all_results = []

# Grid over the number of CV repetitions, the number of folds and the number
# of model re-fits per fold; every factory in all_models is evaluated for each
# combination. The recorded run of this cell took about 16 minutes wall time.
for cv_i in [1, 10]:
    for fold_i in [8]:
        for model_i in [1, 10, 20]:
            for factory in all_models:
                results = cross_val(
                        factory,
                        X, y, 
                        cv_repeat_n = cv_i,
                        model_repeat_n = model_i,
                        folds_n = fold_i
                    )
                all_results.append(get_stats(results))

CPU times: user 57min 33s, sys: 1min 56s, total: 59min 29s
Wall time: 16min 23s


In [29]:
# Stack the per-run summary rows into one table, group the rows by model name
# (the index) and round every value to two decimals for display.
(
    pd.concat(all_results)
    .sort_index()
    .round(2)
)

Unnamed: 0,folds_n,cv_repeat_n,model_repeat_n,min,mean,max,std,fold_min,fold_mean,fold_max,fold_std
AdaBoostClassifier,8,10,20,0.89,0.98,1.0,0.02,0.89,0.98,1.0,0.02
AdaBoostClassifier,8,1,1,0.93,0.98,1.0,0.02,0.93,0.98,1.0,0.02
AdaBoostClassifier,8,1,10,0.93,0.98,1.0,0.02,0.93,0.98,1.0,0.02
AdaBoostClassifier,8,10,1,0.9,0.98,1.0,0.02,0.9,0.98,1.0,0.02
AdaBoostClassifier,8,10,10,0.91,0.98,1.0,0.02,0.91,0.98,1.0,0.02
AdaBoostClassifier,8,1,20,0.93,0.98,1.0,0.02,0.93,0.98,1.0,0.02
DecisionTreeClassifier,8,1,1,0.69,0.89,1.0,0.09,0.69,0.89,1.0,0.09
DecisionTreeClassifier,8,10,10,0.64,0.85,1.0,0.07,0.7,0.85,0.98,0.06
DecisionTreeClassifier,8,10,20,0.68,0.86,1.0,0.07,0.71,0.86,0.99,0.06
DecisionTreeClassifier,8,1,10,0.69,0.86,1.0,0.09,0.71,0.86,0.98,0.08
