In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = False)

Mounted at /content/drive


In [None]:
import os
base_path = "/content/drive/My Drive/data"
os.chdir(base_path);

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import randint
from time import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from pprint import pprint

In [None]:
X_train_inputfile = "./3.1_X_train.csv.gz"
X_valid_inputfile = "./3.1_X_valid.csv.gz"
y_train_inputfile = "./3.1_y_train.csv.gz"
y_valid_inputfile = "./3.1_y_valid.csv.gz"
X_train = pd.read_csv(X_train_inputfile)
X_valid = pd.read_csv(X_valid_inputfile)
y_train = pd.read_csv(y_train_inputfile).transpose().values[0]
y_valid = pd.read_csv(y_valid_inputfile).transpose().values[0]

In [None]:
knn_model = KNeighborsClassifier(algorithm = 'auto')

In [None]:
param_dist = {
    'n_neighbors': randint(5, 100),
    'leaf_size': randint(10, 500)
}

In [None]:
n_iter_search = 50

In [None]:
def _recall_on_deceased(y, y_pred, **kwargs):
    y_series = pd.Series(y)
    y_deceased = y_series[y_series == 0]
    y_pred_deceased = pd.Series(y_pred)[y_deceased.index]
    return recall_score(
        y_true = y_deceased, 
        y_pred = y_pred_deceased, 
        average = 'micro'
    )

scoring = {
    'Accuracy': make_scorer(accuracy_score), 
    'Recall': make_scorer(
        lambda y, y_pred, **kwargs:
            recall_score(
                y_true = y, 
                y_pred = y_pred, 
                average = 'micro'
            )
    ), 
    'Recall_on_deceased': make_scorer(
        lambda y, y_pred, **kwargs:
            _recall_on_deceased(y, y_pred, **kwargs)
    )
}

In [None]:
random_search = RandomizedSearchCV(
    knn_model, 
    param_distributions = param_dist, 
    n_iter = n_iter_search,
    scoring = scoring, 
    refit = 'Recall_on_deceased'
)

In [None]:
%time random_search.fit(X_train, y_train)

CPU times: user 1h 41min 25s, sys: 7.9 s, total: 1h 41min 33s
Wall time: 1h 41min 43s


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='deprecated', n_iter=50, n_jobs=None,
                   param_distributions={'leaf_size': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6b530ed9e8>,
                                        'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6b618d5198>},
                   pre_dispatch='2*n_jobs', random_state=None,
                   refit='Recall_on_deceased', return_train_score=False,
                   scoring={'Accuracy': make_scorer(accuracy_score),
           

In [None]:
def report(results, n_top = 5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_Recall_on_deceased'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Accuracy: {0:.3f}".format(results['mean_test_Accuracy'][candidate]))
            print("Overall recall: {0:.3f}".format(results['mean_test_Recall'][candidate]))
            print("Recall on 'deceased': {0:.3f}".format(results['mean_test_Recall_on_deceased'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

report(random_search.cv_results_)

Model with rank: 1
Accuracy: 0.768
Overall recall: 0.768
Recall on 'deceased': 0.017
Parameters: {'leaf_size': 329, 'n_neighbors': 5}

Model with rank: 2
Accuracy: 0.781
Overall recall: 0.781
Recall on 'deceased': 0.016
Parameters: {'leaf_size': 193, 'n_neighbors': 6}

Model with rank: 2
Accuracy: 0.781
Overall recall: 0.781
Recall on 'deceased': 0.016
Parameters: {'leaf_size': 98, 'n_neighbors': 6}

Model with rank: 4
Accuracy: 0.775
Overall recall: 0.775
Recall on 'deceased': 0.012
Parameters: {'leaf_size': 151, 'n_neighbors': 7}

Model with rank: 5
Accuracy: 0.788
Overall recall: 0.788
Recall on 'deceased': 0.010
Parameters: {'leaf_size': 152, 'n_neighbors': 12}

Model with rank: 5
Accuracy: 0.788
Overall recall: 0.788
Recall on 'deceased': 0.010
Parameters: {'leaf_size': 494, 'n_neighbors': 12}



In [None]:
from collections import Counter
Counter(random_search.predict(X_train))

Counter({0: 743, 1: 171430, 2: 109964, 3: 50208})

In [None]:
random_search.cv_results_['mean_test_Recall_on_deceased']

array([0.00619888, 0.        , 0.        , 0.        , 0.01162224,
       0.00594015, 0.01601267, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00258231, 0.0043911 , 0.        ,
       0.        , 0.0043911 , 0.01007252, 0.01007252, 0.00723181,
       0.        , 0.00542402, 0.        , 0.00516596, 0.        ,
       0.        , 0.0002584 , 0.        , 0.00516596, 0.0043911 ,
       0.01704526, 0.        , 0.        , 0.00490756, 0.00697374,
       0.        , 0.        , 0.        , 0.        , 0.00723181,
       0.        , 0.        , 0.        , 0.00774794, 0.00129199,
       0.        , 0.00774794, 0.0002584 , 0.01601267, 0.00619888])

In [None]:
import pickle
knn_pkl = './KNN_rsCV.pkl'
pickle.dump(random_search.best_estimator_, open(knn_pkl, 'wb'))