In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import randint
from time import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score

In [2]:
X_train_inputfile = "../dataset/3.1_X_train.csv.gz"
X_valid_inputfile = "../dataset/3.1_X_valid.csv.gz"
y_train_inputfile = "../dataset/3.1_y_train.csv.gz"
y_valid_inputfile = "../dataset/3.1_y_valid.csv.gz"
X_train = pd.read_csv(X_train_inputfile)
X_valid = pd.read_csv(X_valid_inputfile)
y_train = pd.read_csv(y_train_inputfile).transpose().values[0]
y_valid = pd.read_csv(y_valid_inputfile).transpose().values[0]

In [3]:
ada_model = AdaBoostClassifier()

In [4]:
param_dist = {
    'n_estimators': randint(50, 301),
    'learning_rate': stats.uniform(0.001, 1),
    'algorithm': ['SAMME', 'SAMME.R']
}

In [5]:
n_iter_search = 3

In [6]:
def _recall_on_deceased(y, y_pred, **kwargs):
    y_series = pd.Series(y)
    y_deceased = y_series[y_series == 0]
    y_pred_deceased = pd.Series(y_pred)[y_deceased.index]
    return recall_score(
        y_true = y_deceased, 
        y_pred = y_pred_deceased, 
        average = 'micro'
    )

scoring = {
    'Accuracy': make_scorer(accuracy_score), 
    'Recall': make_scorer(
        lambda y, y_pred, **kwargs:
            recall_score(
                y_true = y, 
                y_pred = y_pred, 
                average = 'micro'
            )
    ), 
    'Recall_on_deceased': make_scorer(
        lambda y, y_pred, **kwargs:
            _recall_on_deceased(y, y_pred, **kwargs)
    )
}

In [7]:
random_search = RandomizedSearchCV(
    ada_model, 
    param_distributions = param_dist, 
    n_iter = n_iter_search, 
    n_jobs = -1, 
    scoring = scoring, 
    refit = 'Recall_on_deceased'
)

In [8]:
%time random_search.fit(X_train, y_train)

Wall time: 12min 54s


RandomizedSearchCV(estimator=AdaBoostClassifier(), n_iter=3, n_jobs=-1,
                   param_distributions={'algorithm': ['SAMME', 'SAMME.R'],
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10D59770>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10D596D0>},
                   refit='Recall_on_deceased',
                   scoring={'Accuracy': make_scorer(accuracy_score),
                            'Recall': make_scorer(<lambda>),
                            'Recall_on_deceased': make_scorer(<lambda>)})

In [9]:
random_search.cv_results_

{'mean_fit_time': array([115.17695956,  94.05170364, 213.80398779]),
 'std_fit_time': array([ 1.1940724 ,  3.36383274, 40.97116076]),
 'mean_score_time': array([ 4.38477325,  3.31449208, 11.4814754 ]),
 'std_score_time': array([0.13919355, 0.12028645, 2.33911772]),
 'param_algorithm': masked_array(data=['SAMME', 'SAMME', 'SAMME.R'],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate': masked_array(data=[0.15018393631299687, 0.5446477260341165,
                    0.19150210192969064],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[147, 127, 275],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'algorithm': 'SAMME',
   'learning_rate': 0.15018393631299687,
   'n_estimators': 147},
  {'algorithm': 'SAMME',
   'learning_rate': 0.5446477260341165,
   'n_estimators': 127},
  {'algo

In [10]:
def report(results, n_top = 5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_Recall_on_deceased'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Accuracy: {0:.3f}".format(results['mean_test_Accuracy'][candidate]))
            print("Overall recall: {0:.3f}".format(results['mean_test_Recall'][candidate]))
            print("Recall on 'deceased': {0:.3f}".format(results['mean_test_Recall_on_deceased'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [11]:
report(random_search.cv_results_, 3)

Model with rank: 1
Accuracy: 0.759
Overall recall: 0.759
Recall on 'deceased': 0.002
Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.19150210192969064, 'n_estimators': 275}

Model with rank: 2
Accuracy: 0.775
Overall recall: 0.775
Recall on 'deceased': 0.000
Parameters: {'algorithm': 'SAMME', 'learning_rate': 0.15018393631299687, 'n_estimators': 147}

Model with rank: 2
Accuracy: 0.772
Overall recall: 0.772
Recall on 'deceased': 0.000
Parameters: {'algorithm': 'SAMME', 'learning_rate': 0.5446477260341165, 'n_estimators': 127}

