In [None]:
# Attach Google Drive to the Colab VM so the dataset files under
# /content/drive are reachable; an existing mount is left in place.
from google.colab import drive

drive.mount('/content/drive', force_remount=False)

In [None]:
import os

# All data files below are addressed with "./" relative paths, so make the
# dataset folder on the mounted Drive the current working directory.
base_path = "/content/drive/My Drive/CMPT459_Dataset/"
os.chdir(base_path)

In [None]:
import pickle
from pprint import pprint
from time import time

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import randint
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Gzipped CSV inputs, addressed relative to the current working directory.
X_train_inputfile = "./3.1_X_train.csv.gz"
y_train_inputfile = "./3.1_y_train.csv.gz"
X_valid_inputfile = "./3.1_X_valid.csv.gz"
y_valid_inputfile = "./3.1_y_valid.csv.gz"

# Feature matrices are kept as DataFrames.
X_train = pd.read_csv(X_train_inputfile)
X_valid = pd.read_csv(X_valid_inputfile)

# Label files are reduced to a 1-D array: transposing and taking row 0
# yields the values of the first column of each file.
# NOTE(review): assumes the label CSVs hold the labels in their first
# column (i.e. were written without an index column) -- confirm.
y_train = pd.read_csv(y_train_inputfile).T.values[0]
y_valid = pd.read_csv(y_valid_inputfile).T.values[0]

In [None]:
# Seed the estimator so that repeated runs of the grid search (and the model
# that is eventually pickled) are reproducible; without a fixed random_state
# the base learners are seeded differently on every run.
RANDOM_STATE = 42
ada_model = AdaBoostClassifier(random_state=RANDOM_STATE)

In [None]:
# Search space handed to the grid search:
#   - n_estimators: 50, 65, ..., 290 (17 values; the stop value 301 is
#     exclusive and 305 would overshoot it)
#   - learning_rate: 0.001, 0.101, ..., 0.901 (10 values)
#   - algorithm: both boosting variants
# i.e. 17 * 10 * 2 = 340 parameter combinations.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases --
# confirm it is still accepted by the installed version.
param_dist = dict(
    n_estimators=range(50, 301, 15),
    learning_rate=np.arange(0.001, 1, 0.1),
    algorithm=['SAMME', 'SAMME.R'],
)

In [None]:
def _recall_on_deceased(y, y_pred, **kwargs):
    """Recall computed only over the samples whose true label is 0.

    The true labels are narrowed down to the entries equal to 0, the
    predictions at the same positions are selected, and micro-averaged
    recall is computed on that subset.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, positionally aligned with ``y``.
    **kwargs
        Accepted so the function can be called with extra keyword
        arguments; they are not used.

    Returns
    -------
    The value returned by ``recall_score`` for the selected subset.

    Notes
    -----
    NOTE(review): label 0 is assumed to encode the 'deceased' class --
    confirm against the label encoding used to build the dataset.
    """
    # Convert to plain arrays so the selection below is purely positional.
    # A pandas Series ``y`` carrying a non-default index (e.g. a shuffled
    # split) would otherwise be matched against ``y_pred`` by index label,
    # picking the wrong predictions or raising a KeyError.
    y_true = np.asarray(y)
    y_hat = np.asarray(y_pred)
    deceased_mask = y_true == 0
    return recall_score(
        y_true=y_true[deceased_mask],
        y_pred=y_hat[deceased_mask],
        average='micro'
    )

# Metrics evaluated for every parameter combination. Each entry is a scorer
# built with make_scorer; extra keyword arguments given to make_scorer are
# forwarded to the metric function on every call.
# NOTE(review): for single-label classification, micro-averaged recall over
# all classes is numerically the same as accuracy, so 'Recall' is expected
# to mirror 'Accuracy' -- confirm this is the intended metric.
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'Recall': make_scorer(recall_score, average='micro'),
    'Recall_on_deceased': make_scorer(_recall_on_deceased),
}

In [None]:
# Exhaustive search over param_dist, scored with every metric in `scoring`.
# With several metrics, `refit` names the one whose best-ranked parameter
# set is used to retrain the final estimator on the full training data.
# n_jobs=-1 asks for all available processors.
grid_search = GridSearchCV(
    estimator=ada_model,
    param_grid=param_dist,
    scoring=scoring,
    refit='Recall_on_deceased',
    n_jobs=-1,
)

In [None]:
# Run the full grid search on the training data; the %time line magic
# reports how long this single statement took.
%time grid_search.fit(X_train, y_train)

In [None]:
def report(results, n_top = 5):
    """Print a summary of the best-ranked candidates of a search.

    For every rank from 1 to ``n_top`` (inclusive), each candidate whose
    ``rank_test_Recall_on_deceased`` equals that rank is printed with its
    mean test accuracy, mean overall recall, mean recall on 'deceased' and
    its parameter setting, followed by a blank line. Candidates tied on a
    rank are all printed.

    Parameters
    ----------
    results : mapping
        Must provide the keys ``rank_test_Recall_on_deceased``,
        ``mean_test_Accuracy``, ``mean_test_Recall``,
        ``mean_test_Recall_on_deceased`` and ``params``, each indexable by
        candidate position.
    n_top : int, default 5
        Highest rank to include.

    Returns
    -------
    None
    """
    # (line template, results key) pairs, in the order they are printed.
    metric_lines = (
        ("Accuracy: {0:.3f}", 'mean_test_Accuracy'),
        ("Overall recall: {0:.3f}", 'mean_test_Recall'),
        ("Recall on 'deceased': {0:.3f}", 'mean_test_Recall_on_deceased'),
    )
    for rank in range(1, n_top + 1):
        matching = np.flatnonzero(results['rank_test_Recall_on_deceased'] == rank)
        for idx in matching:
            print("Model with rank: {0}".format(rank))
            for template, key in metric_lines:
                print(template.format(results[key][idx]))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [None]:
# Summarise every candidate ranked 1 through 20 on recall for 'deceased'.
report(grid_search.cv_results_, n_top=20)

In [None]:
# Pretty-print the complete cv_results_ mapping of the fitted search.
# NOTE(review): this prints every recorded array for every candidate, so
# the cell output is long -- consider clearing it before sharing.
pprint(grid_search.cv_results_)

In [None]:
# Show the estimator that the search refit with its best-ranked parameters.
print(grid_search.best_estimator_)

In [None]:
# Persist the refit best estimator next to the dataset files. The file is
# opened in a `with` block so the handle is flushed and closed even if
# pickling fails part-way.
# NOTE: only unpickle this file from a trusted location -- loading a pickle
# executes arbitrary code.
ada_pkl = './ADA_GridSearch_Best.pkl'
with open(ada_pkl, 'wb') as pkl_file:
    pickle.dump(grid_search.best_estimator_, pkl_file)