In [121]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
train = pd.read_csv("data/Task_2/account_histroy_data.csv", index_col=1).drop("RowNumber", axis=1)
test = pd.read_csv("data/Task_2/existing_account.csv", index_col=0)

# drop unused columns
drop_cols = ["Surname"]
train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

# binarize categorical columns
cat_cols = ["Geography", "Gender"]
for column in cat_cols:
    train = pd.merge(
        train.drop(column, axis=1),
        pd.get_dummies(train[column]).add_prefix(column + "_"),
        left_index=True, right_index=True
    )
    test = pd.merge(
        test.drop(column, axis=1),
        pd.get_dummies(test[column]).add_prefix(column + "_"),
        left_index=True, right_index=True
    )
    
train.head()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
15634602,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
15619304,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
15701354,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
15737888,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [200]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, recall_score, cohen_kappa_score

metric_funcs = {
    "log_loss": log_loss,
    "recall_score": recall_score,
    "accuracy_score": accuracy_score,
    "cohen_kappa_score": cohen_kappa_score
}


def cross_validate(Model, params, data, t_col = "Exited"):
    log = []
    for i, (tr_i, t_i) in enumerate(KFold(n_splits=5).split(data)):
        X_tr, y_tr = data.drop(t_col, axis=1).iloc[tr_i], data[t_col].iloc[tr_i]
        X_t, y_t = data.drop(t_col, axis=1).iloc[t_i], data[t_col].iloc[t_i]
        model = Model(**params)
        model.fit(X_tr, y_tr)
        pred = model.predict(X_t)
        log.append({
            **{n: f(y_t, pred) for n, f in metric_funcs.items()},
            **params
        })
        
    log_df = pd.DataFrame(log).mean().to_frame().transpose()
    for k, v in params.items():
        log_df[k] = str(v)
        
    return log_df

In [201]:
from multiprocessing import cpu_count

N = 100
CORES = cpu_count()

# sample from generated parameter space for cv
def sample(space):
    params = {}
    for k in space.keys():
        params[k] = random.choice(space[k])

    return params

# iterate over param spaces and aggregate results
def cv_for_params(Model, param_space, param_statics):
    return pd.concat([
        cross_validate(
            Model,
            {**sample(param_space), **param_statics},
            train
        ) for i in range(N)
    ]).reset_index(drop=True)

In [197]:
from sklearn.neighbors import KNeighborsClassifier

param_space = {
    'n_neighbors': list(range(1, 25)),
}
param_statics = {'n_jobs': CORES}

knn_results = cv_for_params(
    KNeighborsClassifier,
    param_space,
    param_statics,
)
knn_results.sort_values("accuracy_score", ascending=False)

Unnamed: 0,index,accuracy_score,cohen_kappa_score,log_loss,n_jobs,n_neighbors,recall_score
2,0,0.795031,0.008396,7.079376,4,18,0.008059
4,0,0.794309,0.012055,7.104304,4,19,0.012538
3,0,0.793381,0.015444,7.136364,4,17,0.017095
1,0,0.786371,0.013627,7.378508,4,11,0.027813
0,0,0.776576,0.034486,7.71682,4,7,0.065304


In [198]:
from xgboost import XGBClassifier

param_space = {
    'learning_rate': np.geomspace(1e-2, 1),
    'max_depth': list(range(1, 10)),
    'gamma': np.geomspace(1e-2, 1),
    'min_child_weight': list(range(1, 10)),
    'num_estimators': list(range(30, 300)),
    'reg_alpha': np.linspace(0.2, 1),
    'reg_lambda': np.linspace(0.2, 2),
    'scale_pos_weight': np.linspace(0.3, 2)
}
param_statics = {'n_jobs': CORES}

xgb_results = cv_for_params(
    XGBClassifier,
    param_space,
    param_statics,
)
xgb_results.sort_values("accuracy_score", ascending=False)

Unnamed: 0,index,accuracy_score,cohen_kappa_score,gamma,learning_rate,log_loss,max_depth,min_child_weight,n_jobs,num_estimators,recall_score,reg_alpha,reg_lambda,scale_pos_weight
4,0,0.863801,0.498929,0.828643,0.025595,4.704175,7,1,4,110,0.446699,0.281633,0.383673,0.855102
1,0,0.85947,0.473031,0.019307,0.02121,4.853746,5,7,4,33,0.414694,0.363265,1.081633,0.889796
2,0,0.856583,0.433578,0.07906,0.244205,4.953443,3,2,4,168,0.351342,0.983673,0.530612,0.404082
3,0,0.85184,0.522372,0.202359,0.152642,5.117317,8,7,4,34,0.575987,0.428571,1.926531,1.861224
0,0,0.841633,0.33404,0.294705,0.017575,5.469801,4,4,4,33,0.25126,0.853061,0.75102,0.369388


In [207]:
from sklearn.neural_network import MLPClassifier

param_space = {
    'hidden_layer_sizes': tuple([
        random.choice(list(range(2, train.shape[1])))
        for i in range(random.choice(list(range(1,4))))
    ] for j in range(100)),
    'activation': ['relu', 'tanh', 'logistic']
}
param_statics = {'early_stopping': True}

ffn_results = cv_for_params(
    MLPClassifier,
    param_space,
    param_statics,
)
ffn_results.sort_values("accuracy_score", ascending=False)

Unnamed: 0,accuracy_score,cohen_kappa_score,early_stopping,log_loss,recall_score,hidden_layer_sizes,activation
2,0.795547,0.0,True,7.061563,0.0,"[3, 5]",logistic
1,0.795238,-2.8e-05,True,7.072246,0.000501,"[7, 8, 13]",tanh
3,0.794619,-0.001256,True,7.093618,0.000501,"[8, 8]",tanh
4,0.791731,0.008907,True,7.193375,0.014737,[12],tanh
0,0.787914,0.010613,True,7.325193,0.023684,[10],tanh
