# Experiment 2.3 - Tune the Ensemble Extra Trees+ Adaboost classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# import load_data function from helper file 
%load_ext autoreload
%autoreload 2

# fix system path
import sys
sys.path.append("/home/jovyan/work")

In [3]:
from src.features.helper_functions import load_sets

X_train, y_train, X_val, y_val, X_test = load_sets()

In [4]:
# Train this model again using the above model
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier(bootstrap=True, max_depth=2, class_weight = "balanced_subsample",
                     max_features=0.5114013866515924, n_estimators=1145,
                     n_jobs=1, random_state=1, verbose=False)

etclf = model.fit(X_train, y_train)

In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

In [6]:
# define base model
base = model # from above
# define ensemble model
adaboost = AdaBoostClassifier(base_estimator=base)

In [18]:
# define the grid of values to search
grid = dict()
grid['n_estimators'] = [10,50,100]#, 500, 1000]
grid['learning_rate'] = [0.1, 0.5, 1.0, 2.0]

# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=1)

# define the grid search procedure
grid_search = GridSearchCV(estimator=adaboost, param_grid=grid, n_jobs=-1, cv=cv, scoring='roc_auc')

In [19]:
grid

{'n_estimators': [10, 50, 100], 'learning_rate': [0.1, 0.5, 1.0, 2.0]}

In [17]:
# try one go and see how long it takes
#one_run = adaboost.fit(X_train,y_train)

In [20]:
# execute the grid search
grid_result = grid_search.fit(X_train, y_train)

# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.698371 using {'learning_rate': 0.1, 'n_estimators': 100}


In [21]:
# summarize all scores that were evaluated
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.688510 (0.025963) with: {'learning_rate': 0.1, 'n_estimators': 10}
0.696008 (0.027411) with: {'learning_rate': 0.1, 'n_estimators': 50}
0.698371 (0.029197) with: {'learning_rate': 0.1, 'n_estimators': 100}
0.695991 (0.027273) with: {'learning_rate': 0.5, 'n_estimators': 10}
0.698148 (0.031684) with: {'learning_rate': 0.5, 'n_estimators': 50}
0.695619 (0.033082) with: {'learning_rate': 0.5, 'n_estimators': 100}
0.698324 (0.029096) with: {'learning_rate': 1.0, 'n_estimators': 10}
0.695887 (0.033128) with: {'learning_rate': 1.0, 'n_estimators': 50}
0.691430 (0.034600) with: {'learning_rate': 1.0, 'n_estimators': 100}
0.698344 (0.030862) with: {'learning_rate': 2.0, 'n_estimators': 10}
0.691228 (0.034531) with: {'learning_rate': 2.0, 'n_estimators': 50}
0.684341 (0.035018) with: {'learning_rate': 2.0, 'n_estimators': 100}


In [22]:
# lets use it - see what happens
model = AdaBoostClassifier(base_estimator=base, learning_rate = 0.1, n_estimators = 100)


etclf_adb_tune = model.fit(X_train, y_train)

In [23]:
# predict class
y_train_preds = etclf_adb_tune.predict(X_train)
y_val_preds = etclf_adb_tune.predict(X_val)

In [24]:
# predict proabilities
y_train_preds_prob = etclf_adb_tune.predict_proba(X_train)
y_val_preds_prob = etclf_adb_tune.predict_proba(X_val)

In [27]:
from src.features.helper_functions import result_metrics
result_metrics(y_train, y_train_preds,y_train_preds_prob)

Accuracy: 64.61%
Precision: 90.46% 
Recall: 64.25% 
AUC using prediction probabilities: 71.834% 
              precision    recall  f1-score   support

           0       0.27      0.66      0.39      1074
           1       0.90      0.64      0.75      5326

    accuracy                           0.65      6400
   macro avg       0.59      0.65      0.57      6400
weighted avg       0.80      0.65      0.69      6400

Confusion Matrix
[[ 713  361]
 [1904 3422]]


In [28]:
result_metrics(y_val, y_val_preds,y_val_preds_prob)

Accuracy: 63.94%
Precision: 90.66% 
Recall: 63.59% 
AUC using prediction probabilities: 70.613% 
              precision    recall  f1-score   support

           0       0.26      0.66      0.37       257
           1       0.91      0.64      0.75      1343

    accuracy                           0.64      1600
   macro avg       0.58      0.65      0.56      1600
weighted avg       0.80      0.64      0.69      1600

Confusion Matrix
[[169  88]
 [489 854]]


In [30]:
from src.features.helper_functions import save_model
save_model(etclf_adb_tune, 'rez_etclf_adb_tune')

Model saved succesfully


In [None]:
# predict on test set


In [31]:
# create an output for kaggle testing anyway.
y_test_preds = etclf_adb_tune.predict(X_test)
y_test_preds_prob = etclf_adb_tune.predict_proba(X_test)

In [32]:
print(y_test_preds)
print(y_test_preds_prob)

[0 0 1 ... 0 1 0]
[[0.50613205 0.49386795]
 [0.5062189  0.4937811 ]
 [0.47234714 0.52765286]
 ...
 [0.51212681 0.48787319]
 [0.46838484 0.53161516]
 [0.50696639 0.49303361]]


In [33]:
from collections import Counter
unique_elements, counts_elements = np.unique(y_test_preds, return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

[[   0    1]
 [1565 2234]]


In [54]:
1560/2239

0.6967396158999554

In [31]:
from src.features.helper_functions import create_output

In [55]:
output = create_output(y_test_preds_prob)

In [56]:
output

Unnamed: 0,Id,TARGET_5Yrs
0,0,0.496102
1,1,0.495772
2,2,0.519474
3,3,0.542378
4,4,0.494874
...,...,...
3794,3794,0.520148
3795,3795,0.500218
3796,3796,0.492041
3797,3797,0.520674


In [57]:
# save to csv
output.to_csv('../data/processed/output_etclf_adaboost_wk3.csv',index=False)