In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import randint
from time import time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score

In [2]:
X_train_inputfile = "../dataset/3.1_X_train.csv.gz"
X_valid_inputfile = "../dataset/3.1_X_valid.csv.gz"
y_train_inputfile = "../dataset/3.1_y_train.csv.gz"
y_valid_inputfile = "../dataset/3.1_y_valid.csv.gz"
X_train = pd.read_csv(X_train_inputfile)
X_valid = pd.read_csv(X_valid_inputfile)
y_train = pd.read_csv(y_train_inputfile).transpose().values[0]
y_valid = pd.read_csv(y_valid_inputfile).transpose().values[0]

In [3]:
ada_model = AdaBoostClassifier()

In [4]:
param_dist = {
    'n_estimators': randint(50, 301),
    'learning_rate': stats.uniform(0.001, 1),
    'algorithm': ['SAMME', 'SAMME.R']
}
n_iter_search = 2
scoring = {
    'Accuracy': make_scorer(accuracy_score), 
    'Recall': make_scorer(
        lambda y, y_pred, **kwargs: 
            recall_score(y, y_pred, average = 'micro')
    )
}

In [5]:
random_search = RandomizedSearchCV(
    ada_model, 
    param_distributions = param_dist, 
    n_iter = n_iter_search, 
    n_jobs = -1, 
    scoring = scoring, 
    refit = 'Recall'
)

In [6]:
%time random_search.fit(X_train, y_train)

Wall time: 8min 1s


RandomizedSearchCV(estimator=AdaBoostClassifier(), n_iter=2, n_jobs=-1,
                   param_distributions={'algorithm': ['SAMME', 'SAMME.R'],
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1068A9B0>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1068A930>},
                   refit='Recall',
                   scoring={'Accuracy': make_scorer(accuracy_score),
                            'Recall': make_scorer(<lambda>)})

In [7]:
random_search.cv_results_

{'mean_fit_time': array([122.68756323, 143.30676999]),
 'std_fit_time': array([ 8.08577845, 31.69049524]),
 'mean_score_time': array([5.25068226, 4.37364545]),
 'std_score_time': array([0.11599422, 0.9006477 ]),
 'param_algorithm': masked_array(data=['SAMME', 'SAMME'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate': masked_array(data=[0.01179167166379591, 0.8449751881496528],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[177, 214],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'algorithm': 'SAMME',
   'learning_rate': 0.01179167166379591,
   'n_estimators': 177},
  {'algorithm': 'SAMME',
   'learning_rate': 0.8449751881496528,
   'n_estimators': 214}],
 'split0_test_Accuracy': array([0.76014383, 0.74652846]),
 'split1_test_Accuracy': array([0.7593916 , 0.75886503]),
 'split2_test_Accuracy'

In [8]:
def report(results, n_top = 2):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_Accuracy'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f}".format(results['mean_test_Accuracy'][candidate]))
            print("Mean validation recall: {0:.3f}".format(results['mean_test_Recall'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [9]:
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.760
Mean validation recall: 0.760
Parameters: {'algorithm': 'SAMME', 'learning_rate': 0.01179167166379591, 'n_estimators': 177}

Model with rank: 2
Mean validation score: 0.737
Mean validation recall: 0.737
Parameters: {'algorithm': 'SAMME', 'learning_rate': 0.8449751881496528, 'n_estimators': 214}



In [28]:
pred = random_search.predict(X_train)

In [29]:
X_train['outcome'] = y_train
X_train['predict_outcome'] = pred
X_train

Unnamed: 0,age,sex,province,country,latitude,longitude,Confirmed,Deaths,Recovered,Active,Incidence_Rate,Case-Fatality_Ratio,outcome,predict_outcome
0,27.000000,2,185,83,-12.02172,-77.03317,349167,14009,0,335158.0,3285.195465,4.012120,2,2
1,41.939287,0,196,48,19.03681,73.01582,1167496,31791,834432,301273.0,948.072083,2.723007,1,1
2,62.973512,0,88,48,28.61474,77.20910,238828,4907,201671,32250.0,1276.409575,2.054617,1,1
3,70.847607,0,360,48,27.56192,80.68265,342788,4869,270094,67825.0,144.099577,1.420411,3,1
4,51.214401,0,337,48,11.93782,79.50211,530908,8685,475717,46506.0,682.039258,1.635877,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332340,20.572365,0,130,48,23.02776,72.60027,120336,3286,100974,16076.0,188.400627,2.730687,1,1
332341,57.610939,0,278,48,31.63347,74.87507,92833,2708,68463,21662.0,307.991942,2.917066,3,1
332342,24.500000,1,241,39,51.21564,6.77666,64692,1844,58524,4324.0,360.749785,2.850430,2,2
332343,59.759377,0,196,48,18.94017,72.83483,1167496,31791,834432,301273.0,948.072083,2.723007,1,1


In [26]:
# X_train = X_train.drop('outcome', axis = 1)

In [37]:
X_train[(X_train['outcome'] == 0) & (X_train['predict_outcome'] == 0)]

Unnamed: 0,age,sex,province,country,latitude,longitude,Confirmed,Deaths,Recovered,Active,Incidence_Rate,Case-Fatality_Ratio,outcome,predict_outcome


In [36]:
recall_score(
    X_train[X_train['outcome'] == 0]['outcome'], 
    X_train[X_train['outcome'] == 0]['predict_outcome'], 
    average = 'micro'
)

0.0