In [121]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
train = pd.read_csv("data/Task_2/account_histroy_data.csv", index_col=1).drop("RowNumber", axis=1)
test = pd.read_csv("data/Task_2/existing_account.csv", index_col=0)

# drop unused columns
drop_cols = ["Surname"]
train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

# binarize categorical columns
cat_cols = ["Geography", "Gender"]
for column in cat_cols:
    train = pd.merge(
        train.drop(column, axis=1),
        pd.get_dummies(train[column]).add_prefix(column + "_"),
        left_index=True, right_index=True
    )
    test = pd.merge(
        test.drop(column, axis=1),
        pd.get_dummies(test[column]).add_prefix(column + "_"),
        left_index=True, right_index=True
    )
    
train.head()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
15634602,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
15619304,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
15701354,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
15737888,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [148]:
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score, recall_score, cohen_kappa_score

metric_funcs = {
    "log_loss": log_loss,
    "recall_score": recall_score,
    "accuracy_score": accuracy_score,
    "cohen_kappa_score": cohen_kappa_score
}


def cross_validate(Model, params, data, t_col = "Exited"):
    log = []
    for i, (tr_i, t_i) in enumerate(KFold(n_splits=5).split(data)):
        X_tr, y_tr = data.drop(t_col, axis=1).iloc[tr_i], data[t_col].iloc[tr_i]
        X_t, y_t = data.drop(t_col, axis=1).iloc[t_i], data[t_col].iloc[t_i]
        model = Model(**params)
        model.fit(X_tr, y_tr)
        pred = model.predict(X_t)
        log.append({
            **{n: f(y_t, pred) for n, f in metric_funcs.items()},
            **params
        })
        
    return pd.DataFrame(log).mean()

In [149]:
from multiprocessing import cpu_count

N = 5
CORES = cpu_count()

# sample from generated parameter space for cv
def sample(space):
    params = {}
    for k in space.keys():
        params[k] = random.choice(space[k])

    return params

# iterate over param spaces and aggregate results
def cv_for_params(Model, param_space, param_statics):
    return pd.concat([
        cross_validate(
            Model,
            {**sample(param_space), **param_statics},
            train
        ) for i in range(N)
    ])

In [None]:
from xgboost import XGBClassifier

param_space = {
    'learning_rate': np.geomspace(1e-2, 1),
    'max_depth': list(range(1, 10)),
    'gamma': np.geomspace(1e-2, 1),
    'min_child_weight': list(range(1, 10)),
    'num_estimators': list(range(30, 300)),
    'reg_alpha': np.linspace(0.2, 1),
    'reg_lambda': np.linspace(0.2, 2),
    'scale_pos_weight': np.linspace(0.3, 2)
}
param_statics = {'n_jobs': CORES}

xgb_results = cv_for_params(
    XGBClassifier,
    param_space,
    param_statics,
)
xgb_results

In [114]:
from sklearn.neighbors import KNeighborsClassifier

param_space = {
    'n_neighbors': list(range(1, 25)),
}
param_statics = {'n_jobs': CORES}

knn_results = pd.concat([
    cross_validate(
        KNeighborsClassifier,
        {**sample(param_space), **param_statics},
        train
    ) for i in range(N)
])
knn_results

Unnamed: 0,accuracy_score,cohen_kappa_score,log_loss,recall_score,accuracy_score.1,cohen_kappa_score.1,log_loss.1,recall_score.1,accuracy_score.2,cohen_kappa_score.2,log_loss.2,recall_score.2,accuracy_score.3,cohen_kappa_score.3,log_loss.3,recall_score.3,accuracy_score.4,cohen_kappa_score.4,log_loss.4,recall_score.4
min,0.664433,-0.008826,10.308333,0.200521,0.766495,0.013659,7.374471,0.052083,0.778866,-0.003354,6.925565,0.007557,0.781443,-0.005116,6.836542,0.0,0.754124,-0.006582,7.619917,0.039062
max,0.701546,0.050415,11.590203,0.223058,0.786488,0.04942,8.065008,0.077694,0.799485,0.022355,7.637705,0.023438,0.802062,0.00799,7.548682,0.005038,0.779381,0.030147,8.4923,0.06383
mean,0.682854,0.014124,10.953974,0.208869,0.776576,0.034486,7.71682,0.065304,0.790495,0.010933,7.236066,0.018148,0.795031,0.00191,7.079377,0.002502,0.771214,0.011948,7.902012,0.052714


In [127]:
from sklearn.neural_network import MLPClassifier

param_space = {
    'hidden_layer_sizes': tuple([
        [
            random.choice(list(range(2, train.shape[1])))
            for i in range(random.choice(list(range(1,4))))]
    ]),
    'activation': ['relu', 'tanh', 'logistic']
}
param_statics = {'early_stopping': True}

ffn_results = pd.concat([
    cross_validate(
        MLPClassifier,
        {**sample(param_space), **param_statics},
        train
    ) for i in range(N)
], axis=1)
ffn_results



KeyboardInterrupt: 