In [None]:
from collections import Counter
from pprint import pprint
from time import time

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import randint
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import make_scorer, accuracy_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Paths to the pre-split training / validation sets (gzip-compressed CSVs).
X_train_inputfile = "../dataset/3.1_X_train.csv.gz"
X_valid_inputfile = "../dataset/3.1_X_valid.csv.gz"
y_train_inputfile = "../dataset/3.1_y_train.csv.gz"
y_valid_inputfile = "../dataset/3.1_y_valid.csv.gz"

# Feature matrices are kept as DataFrames.
X_train, X_valid = [
    pd.read_csv(features_path)
    for features_path in (X_train_inputfile, X_valid_inputfile)
]

# Labels: transpose the loaded frame and take its first row, i.e. the first
# CSV column as a flat array.
y_train, y_valid = [
    pd.read_csv(labels_path).transpose().values[0]
    for labels_path in (y_train_inputfile, y_valid_inputfile)
]

In [None]:
# Linear classifier trained with stochastic gradient descent.
# SGD shuffles the training data between epochs; a fixed random_state keeps
# the fitted model repeatable across Restart & Run All.
sgd_model = SGDClassifier(random_state = 42)

In [None]:
# Search space sampled by the randomized hyper-parameter search.
# NOTE(review): newer scikit-learn releases spell the logistic loss
# 'log_loss' instead of 'log' -- confirm against the installed version.
param_dist = dict(
    loss = ['hinge', 'modified_huber', 'log'],
    penalty = ['l1', 'l2', 'elasticnet'],
    # scipy's randint draws integers from the half-open range [25, 250).
    max_iter = randint(25, 250),
)

In [None]:
# Number of parameter settings sampled by the randomized search (its n_iter).
n_iter_search = 15

In [None]:
def _recall_on_deceased(y, y_pred, **kwargs):
    """Recall restricted to the samples whose true label is 0 ('deceased').

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels, positionally aligned with ``y``.
    **kwargs
        Accepted for scorer compatibility and ignored.

    Returns
    -------
    float
        Micro-averaged recall computed only over the rows where ``y == 0``,
        i.e. the fraction of those rows that were also predicted as 0.
    """
    # Select rows by position with a boolean mask on plain arrays. Going
    # through pandas index labels would pick the wrong rows (or raise) when
    # ``y`` is a Series whose index is not 0..n-1.
    true_labels = np.asarray(y)
    predicted_labels = np.asarray(y_pred)
    deceased_mask = true_labels == 0
    return recall_score(
        y_true = true_labels[deceased_mask],
        y_pred = predicted_labels[deceased_mask],
        average = 'micro'
    )

def _overall_recall(y, y_pred, **kwargs):
    """Micro-averaged recall over all samples; extra keyword arguments are ignored."""
    return recall_score(y_true = y, y_pred = y_pred, average = 'micro')


# Metrics recorded for every candidate of the hyper-parameter search.
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'Recall': make_scorer(_overall_recall),
    # Recall restricted to the samples whose true label is 0.
    'Recall_on_deceased': make_scorer(_recall_on_deceased),
}

In [None]:
# Randomized hyper-parameter search over the SGD classifier.
# The estimator must be `sgd_model`: `param_dist` holds SGDClassifier
# parameters (loss / penalty / max_iter) and no `knn_model` is defined in
# this notebook, so referencing it fails on a fresh kernel.
random_search = RandomizedSearchCV(
    sgd_model,
    param_distributions = param_dist,
    n_iter = n_iter_search,
    n_jobs = -1,                      # use all available cores
    scoring = scoring,
    # With several metrics, `refit` names the one used to pick (and refit)
    # the best estimator.
    refit = 'Recall_on_deceased',
    # Fixed seed so the same candidates are sampled on every run.
    random_state = 42
)

In [None]:
# Run the search on the training split; %time reports how long the fit takes.
%time random_search.fit(X_train, y_train)


In [None]:

def report(results, n_top = 5):
    """Print the best-ranked candidates of a multi-metric search.

    Candidates are ordered by ``rank_test_Recall_on_deceased``; every
    candidate tied at a given rank is printed.

    Parameters
    ----------
    results : mapping
        Must provide the ``rank_test_Recall_on_deceased``, ``mean_test_Accuracy``,
        ``mean_test_Recall``, ``mean_test_Recall_on_deceased`` and ``params``
        entries (e.g. the ``cv_results_`` of a fitted search).
    n_top : int, default 5
        Ranks 1..n_top are printed.
    """
    # (format template, results key) for each metric line, in print order.
    metric_lines = (
        ("Accuracy: {0:.3f}", 'mean_test_Accuracy'),
        ("Overall recall: {0:.3f}", 'mean_test_Recall'),
        ("Recall on 'deceased': {0:.3f}", 'mean_test_Recall_on_deceased'),
    )
    for rank in range(1, n_top + 1):
        tied = np.flatnonzero(results['rank_test_Recall_on_deceased'] == rank)
        for position in tied:
            print("Model with rank: {0}".format(rank))
            for template, key in metric_lines:
                print(template.format(results[key][position]))
            print("Parameters: {0}".format(results['params'][position]))
            print("")

# Print the candidates ranked 1-5 (report's default n_top) by recall on the 'deceased' class.
report(random_search.cv_results_)

In [None]:
# Tally how often each class is predicted on the training set by the refit
# best estimator (Counter is imported in the notebook's import cell).
Counter(random_search.predict(X_train))


In [None]:
# Mean cross-validated 'Recall_on_deceased' score of every sampled candidate.
random_search.cv_results_['mean_test_Recall_on_deceased']