In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the data files are reachable from the
# notebook; force_remount = False leaves an already-mounted drive untouched.
drive.mount('/content/drive', force_remount = False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Directory on the mounted drive that holds the input CSVs; every relative
# path used later in the notebook is resolved against it.
base_path = "/content/drive/My Drive/data"
os.chdir(base_path)

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import randint
from time import time
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from pprint import pprint

In [None]:
# Input files, relative to the current working directory.
X_train_inputfile, X_valid_inputfile = "./3.1_X_train.csv.gz", "./3.1_X_valid.csv.gz"
y_train_inputfile, y_valid_inputfile = "./3.1_y_train.csv.gz", "./3.1_y_valid.csv.gz"

# Feature matrices stay as DataFrames.
X_train, X_valid = (
    pd.read_csv(csv_path) for csv_path in (X_train_inputfile, X_valid_inputfile)
)

# Label files are transposed and their first row is taken, i.e. the first
# column of each file becomes a flat numpy array.
y_train, y_valid = (
    pd.read_csv(csv_path).T.to_numpy()[0]
    for csv_path in (y_train_inputfile, y_valid_inputfile)
)

In [None]:
# Base learner for boosting; its max_depth is tuned through the
# "base_estimator__max_depth" entry of the search space.
dtree = DecisionTreeClassifier()
# A fixed random_state on the ensemble makes training repeatable: AdaBoost
# uses it to seed each base estimator that exposes a random_state parameter.
ada_model = AdaBoostClassifier(base_estimator=dtree, random_state=42)

In [None]:
# Search space sampled by the randomized search.
# - randint(low, high) draws integers from [low, high), so the upper bound is
#   exclusive.
# - stats.uniform takes (loc, scale) and draws from [loc, loc + scale]; the
#   previous uniform(0.1, 1) therefore sampled [0.1, 1.1]. loc=0.1, scale=0.9
#   gives the range [0.1, 1.0].
param_dist = {
    "base_estimator__max_depth": randint(50, 200),
    "n_estimators": randint(50, 300),
    "learning_rate": stats.uniform(loc=0.1, scale=0.9),
}

In [None]:
# Number of parameter settings the randomized search samples (passed as n_iter).
n_iter_search = 10

In [None]:
def _recall_on_deceased(y, y_pred, **kwargs):
    y_series = pd.Series(y)
    y_deceased = y_series[y_series == 0]
    y_pred_deceased = pd.Series(y_pred)[y_deceased.index]
    return recall_score(
        y_true = y_deceased, 
        y_pred = y_pred_deceased, 
        average = 'micro'
    )

# Metrics tracked for every candidate during the search. The keys are reused
# verbatim in cv_results_ column names ("mean_test_<key>", "rank_test_<key>")
# and as the refit criterion, so they must not be renamed.
scoring = dict(
    Accuracy=make_scorer(accuracy_score),
    # Micro-averaged recall over all classes.
    Recall=make_scorer(
        lambda y_true, y_hat, **_ignored: recall_score(
            y_true=y_true,
            y_pred=y_hat,
            average='micro',
        )
    ),
    # Recall computed only on the samples whose true label is 0.
    Recall_on_deceased=make_scorer(
        lambda y_true, y_hat, **extra: _recall_on_deceased(y_true, y_hat, **extra)
    ),
)

In [None]:
# Randomized hyper-parameter search over the AdaBoost model.
# - All three metrics in `scoring` are recorded; the best candidate by
#   'Recall_on_deceased' is refit on the full training data.
# - random_state fixes the draws from the scipy distributions in `param_dist`
#   so the same candidates are sampled on every run.
random_search = RandomizedSearchCV(
    ada_model,
    param_distributions = param_dist,
    n_iter = n_iter_search,
    n_jobs = -1,
    pre_dispatch = '2*n_jobs',
    scoring = scoring,
    refit = 'Recall_on_deceased',
    random_state = 42
)

In [None]:
# Run the randomized search on the training split; %time reports the CPU and
# wall-clock time of this single statement.
%time random_search.fit(X_train, y_train)



CPU times: user 3min 24s, sys: 1.17 s, total: 3min 25s
Wall time: 1h 50min 45s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                                base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                                      class_weight=None,
                                                                                      criterion='gini',
                                                                                      max_depth=None,
                                                                                      max_features=None,
                                                                                      max_leaf_nodes=None,
                                                                                      min_impurity_decrease=0.0,
                                                                                      min_impurity_split=None,
                                      

In [None]:
def report(results, n_top = 5):
    """Print a summary of the best candidates of a multi-metric search.

    Candidates are listed by their 'Recall_on_deceased' rank, from rank 1 up
    to rank ``n_top``; every candidate that shares a rank is printed.

    Parameters
    ----------
    results : mapping
        A ``cv_results_``-style mapping containing the keys
        'rank_test_Recall_on_deceased', 'mean_test_Accuracy',
        'mean_test_Recall', 'mean_test_Recall_on_deceased' and 'params'.
    n_top : int
        Highest rank to include in the report.
    """
    for rank in range(1, n_top + 1):
        matching = np.flatnonzero(results['rank_test_Recall_on_deceased'] == rank)
        for idx in matching:
            summary = [
                "Model with rank: {0}".format(rank),
                "Accuracy: {0:.3f}".format(results['mean_test_Accuracy'][idx]),
                "Overall recall: {0:.3f}".format(results['mean_test_Recall'][idx]),
                "Recall on 'deceased': {0:.3f}".format(results['mean_test_Recall_on_deceased'][idx]),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # trailing blank line separates consecutive entries
            ]
            print("\n".join(summary))

# Summarise the search: candidates ranked 1..5 (the default n_top) by
# 'Recall_on_deceased'.
report(random_search.cv_results_)

Model with rank: 1
Accuracy: 0.736
Overall recall: 0.736
Recall on 'deceased': 0.049
Parameters: {'base_estimator__max_depth': 87, 'learning_rate': 0.7059172808517837, 'n_estimators': 119}

Model with rank: 2
Accuracy: 0.738
Overall recall: 0.738
Recall on 'deceased': 0.049
Parameters: {'base_estimator__max_depth': 198, 'learning_rate': 0.13262494788621962, 'n_estimators': 104}

Model with rank: 3
Accuracy: 0.738
Overall recall: 0.738
Recall on 'deceased': 0.049
Parameters: {'base_estimator__max_depth': 146, 'learning_rate': 0.2214340755857919, 'n_estimators': 51}

Model with rank: 4
Accuracy: 0.737
Overall recall: 0.737
Recall on 'deceased': 0.048
Parameters: {'base_estimator__max_depth': 192, 'learning_rate': 0.8548604398687231, 'n_estimators': 213}

Model with rank: 5
Accuracy: 0.740
Overall recall: 0.740
Recall on 'deceased': 0.047
Parameters: {'base_estimator__max_depth': 138, 'learning_rate': 1.042908023887684, 'n_estimators': 53}



In [None]:
from collections import Counter
# Tally how often each class label is predicted on the training data by the
# refit search object.
Counter(random_search.predict(X_train))

Counter({0: 3696, 1: 148140, 2: 109816, 3: 70693})

In [None]:
# Mean cross-validated 'Recall_on_deceased' score, one entry per sampled candidate.
random_search.cv_results_['mean_test_Recall_on_deceased']

array([0.04700008, 0.049323  , 0.04364258, 0.04751788, 0.04338518,
       0.04441677, 0.049325  , 0.0488092 , 0.04209486, 0.04622222])

In [None]:
import pickle
# Persist the refit best estimator (not the whole search object) next to the
# data. The context manager guarantees the file is flushed and closed even if
# pickling fails; previously the handle returned by open() was never closed.
ada_pkl = './ADA_rsCV.pkl'
with open(ada_pkl, 'wb') as pkl_file:
    pickle.dump(random_search.best_estimator_, pkl_file)