In [19]:
#set up
%load_ext autoreload
%autoreload 2
from __future__ import print_function
import os,sys
sys.path.append('../')

## Math and dataFrame
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn import datasets
import xgboost as xgb
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
from sklearn.metrics import log_loss, accuracy_score, f1_score, confusion_matrix, roc_auc_score

from bayes_opt import BayesianOptimization


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
def evaluate_features(X, y, clf=None, trn_weights = None):
    """General helper function for evaluating effectiveness of passed features in ML model
    
    Prints out Log loss, accuracy, and confusion matrix with 3-fold stratified cross-validation
    
    Args:
        X (array-like): Features array. Shape (n_samples, n_features)
        
        y (array-like): Labels array. Shape (n_samples,)
        
        clf: Classifier to use. If None, default Log reg is use.
    """
    if clf is None:
        raise ValueError("clf NOne")
    
    probas = cross_val_predict(clf, X, y, cv=KFold(random_state=8), 
                              n_jobs=-1, method='predict_proba', verbose=2,
                              fit_params = {'sample_weight': trn_weights}
                              )
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]
    print('Log loss: {}'.format(log_loss(y, probas)))
    print('Accuracy: {}'.format(accuracy_score(y, preds)))
    print('F1 score: {}'.format(f1_score(y, preds)))
    print('Auc score: {}'.format(roc_auc_score(y, preds)))
    print( confusion_matrix(y, preds) )
    

In [3]:
X, y = datasets.make_classification(n_samples=20000, n_features=20,
                                    n_informative=2, n_redundant=2)

train_samples = 10000  # Samples used for training the models

X_train = X[:train_samples]
X_test = X[train_samples:]
y_train = y[:train_samples]
y_test = y[train_samples:]



In [12]:
clf = xgb.XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.05)


In [16]:
print("baseline xgb")
evaluate_features(X, y, clf=clf, trn_weights = None)

baseline xgb
Log loss: 0.216402683828
Accuracy: 0.91895
F1 score: 0.918742794125
Auc score: 0.918958775835
[[9215  765]
 [ 856 9164]]


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.1s finished


In [17]:
#bayes opt
def XGB_CV(
          max_depth,
          gamma,
          min_child_weight,
          max_delta_step,
          subsample,
          colsample_bytree
         ):

    global AUCbest
    global ITERbest

#
# Define all XGboost parameters
#

    paramt = {
              'booster' : 'gbtree',
              'max_depth' : int(max_depth),
              'gamma' : gamma,
              'eta' : 0.1,
              'objective' : 'binary:logistic',
              'nthread' : 4,
              'silent' : True,
              'eval_metric': 'auc',
              'subsample' : max(min(subsample, 1), 0),
              'colsample_bytree' : max(min(colsample_bytree, 1), 0),
              'min_child_weight' : min_child_weight,
              'max_delta_step' : int(max_delta_step),
              'seed' : 1001
              }

    folds = 5
    cv_score = 0

    print("\n Search parameters (%d-fold validation):\n %s" % (folds, paramt), file=log_file )
    log_file.flush()

    xgbc = xgb.cv(
                    paramt,
                    dtrain,
                    num_boost_round = 20000,
                    stratified = True,
                    nfold = folds,
#                    verbose_eval = 10,
                    early_stopping_rounds = 100,
                    metrics = 'auc',
                    show_stdv = True
               )

# This line would have been on top of this section
#    with capture() as result:

# After xgb.cv is done, this section puts its output into log file. Train and validation scores 
# are also extracted in this section. Note the "diff" part in the printout below, which is the 
# difference between the two scores. Large diff values may indicate that a particular set of 
# parameters is overfitting, especially if you check the CV portion of it in the log file and find 
# out that train scores were improving much faster than validation scores.

#    print('', file=log_file)
#    for line in result[1]:
#        print(line, file=log_file)
#    log_file.flush()

    val_score = xgbc['test-auc-mean'].iloc[-1]
    train_score = xgbc['train-auc-mean'].iloc[-1]
    print(' Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' % ( len(xgbc), train_score, val_score, (train_score - val_score), (train_score*2-1),
(val_score*2-1)) )
    if ( val_score > AUCbest ):
        AUCbest = val_score
        ITERbest = len(xgbc)

    return (val_score*2) - 1


In [20]:
log_file = open('Porto-AUC-5fold-XGB-run-01-v1-full.log', 'a')
AUCbest = -1.
ITERbest = 0

dtrain = xgb.DMatrix(X_train, label = y_train)

XGB_BO = BayesianOptimization(XGB_CV, {
                                     'max_depth': (2, 12),
                                     'gamma': (0.001, 10.0),
                                     'min_child_weight': (0, 20),
                                     'max_delta_step': (0, 10),
                                     'subsample': (0.4, 1.0),
                                     'colsample_bytree' :(0.4, 1.0)
                                    })

In [23]:
# XGB_BO.explore({
#               'max_depth':            [3, 8, 3, 8, 8, 3, 8, 3],
#               'gamma':                [0.5, 8, 0.2, 9, 0.5, 8, 0.2, 9],
#               'min_child_weight':     [0.2, 0.2, 0.2, 0.2, 12, 12, 12, 12],
#               'max_delta_step':       [1, 2, 2, 1, 2, 1, 1, 2],
#               'subsample':            [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
#               'colsample_bytree':     [0.6, 0.8, 0.6, 0.8, 0.6, 0.8, 0.6, 0.8],
#               })


In [26]:
XGB_BO.maximize(init_points=2, n_iter=5, acq='ei', xi=0.0)

print('Final Results')
print('Maximum XGBOOST value: %f' % XGB_BO.res['max']['max_val'])
print('Best XGBOOST parameters: ', XGB_BO.res['max']['max_params'])
print('-'*130, file=log_file)
print('Final Result:', file=log_file)
print('Maximum XGBOOST value: %f' % XGB_BO.res['max']['max_val'], file=log_file)
print('Best XGBOOST parameters: ', XGB_BO.res['max']['max_params'], file=log_file)

history_df = pd.DataFrame(XGB_BO.res['all']['params'])
history_df2 = pd.DataFrame(XGB_BO.res['all']['values'])
history_df = pd.concat((history_df, history_df2), axis=1)
history_df.rename(columns = { 0 : 'gini'}, inplace=True)
history_df['AUC'] = ( history_df['gini'] + 1 ) / 2

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_delta_step |   max_depth |   min_child_weight |   subsample | 
 Stopped after 64 iterations with train-auc = 0.976900 val-auc = 0.972154 ( diff = 0.004745 ) train-gini = 0.953800 val-gini = 0.944309
    1 | 00m04s | [35m   0.94431[0m | [32m            0.6000[0m | [32m   0.5000[0m | [32m          1.0000[0m | [32m     3.0000[0m | [32m            0.2000[0m | [32m     0.6000[0m | 
 Stopped after 41 iterations with train-auc = 0.982752 val-auc = 0.973108 ( diff = 0.009644 ) train-gini = 0.965503 val-gini = 0.946216
    2 | 00m09s | [35m   0.94622[0m | [32m            0.8000[0m | [32m   8.0000[0m | [32m          2.0000[0m | [32m     8.0000[0m | [32m            0.2000[0m | [32m     0.8000[0m | 
 Stopped after 65 iterations with train-au

  " state: %s" % convergence_dict)


 Stopped after 77 iterations with train-auc = 0.979714 val-auc = 0.972963 ( diff = 0.006751 ) train-gini = 0.959428 val-gini = 0.945925
   12 | 00m52s |    0.94593 |             0.9854 |    9.9495 |           0.5661 |     11.6228 |            14.8761 |      0.9928 | 


  " state: %s" % convergence_dict)


 Stopped after 109 iterations with train-auc = 0.974641 val-auc = 0.971362 ( diff = 0.003280 ) train-gini = 0.949283 val-gini = 0.942724
   13 | 01m37s |    0.94272 |             0.8768 |    9.5930 |           2.3707 |      2.0212 |            19.8594 |      0.9324 | 
 Stopped after 68 iterations with train-auc = 0.978659 val-auc = 0.972368 ( diff = 0.006291 ) train-gini = 0.957318 val-gini = 0.944736
   14 | 01m13s |    0.94474 |             0.8413 |    0.1335 |           6.7944 |      3.9460 |             0.1392 |      0.9714 | 


  " state: %s" % convergence_dict)


 Stopped after 63 iterations with train-auc = 0.980602 val-auc = 0.972782 ( diff = 0.007821 ) train-gini = 0.961205 val-gini = 0.945564
   15 | 00m54s |    0.94556 |             0.9984 |    9.8275 |           0.4670 |     10.9794 |             2.0849 |      0.4713 | 


  " state: %s" % convergence_dict)


Final Results
Maximum XGBOOST value: 0.946425
Best XGBOOST parameters:  {'colsample_bytree': 0.8, 'max_delta_step': 1.0, 'min_child_weight': 0.2, 'subsample': 0.8, 'max_depth': 8.0, 'gamma': 9.0}
