## <center>Hyper-parameter tuning of XGBoost - The Bayesian Way</center>

### XGBoost + HyperOpt


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import roc_auc_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import numpy as np
import xgboost as xgb

In [39]:
# The path is relative to the kernel's working directory.
csv_path = 'Downloads/attrition.csv'
data = pd.read_csv(csv_path)

In [40]:
# Peek at the first rows of the raw frame.
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [41]:
# Encode the target as 1 for 'Yes' and 0 for anything else.
# An already-encoded 1 is accepted as well so that the cell is idempotent:
# re-running it no longer collapses every label to 0.
data['Attrition'] = data['Attrition'].isin(['Yes', 1]).astype('int64')

In [42]:
# Check that Attrition now holds 0/1 instead of 'No'/'Yes'.
data.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [43]:
# One-hot encode the categorical columns; drop_first removes one level per
# category so the indicator columns are not redundant.
dummy = pd.get_dummies(
    data=data,
    drop_first=True,
)

In [44]:
# (rows, columns) of the one-hot encoded frame.
dummy.shape

(1470, 48)

In [45]:
# (rows, columns) of the frame before encoding, for comparison with `dummy`.
data.shape

(1470, 35)

In [46]:
# Features are every column except the target.
# 'Unnamed: 0' appears to be the row index that was saved into the CSV
# (it reads 0, 1, 2, ... in the head() output). It carries no signal, so it is
# dropped too; errors='ignore' keeps the cell working if the column is absent.
x = (
    dummy
    .drop(columns=['Attrition'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = dummy['Attrition']

In [47]:
# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [48]:
# Run an XGBoost model whose hyperparameters are optimized with hyperopt.
# The output of the script is the best set of hyperparameters.


# Scoring and optimization functions
def score(params):
    """Train one XGBoost model for a hyperopt trial and return its loss.

    Fits on the notebook-level ``train_x``/``train_y`` and evaluates on
    ``test_x``/``test_y``.

    NOTE(review): the hold-out split is also what the search optimizes
    against, so the printed ROC-AUC is an optimistic estimate.

    Parameters
    ----------
    params : dict
        One sample from the search space. ``n_estimators`` is taken as the
        number of boosting rounds; every other entry is forwarded to
        ``xgb.train``. The dict itself is left unmodified.

    Returns
    -------
    dict
        ``{'loss': 1 - ROC-AUC, 'status': STATUS_OK}``, the shape that
        ``hyperopt.fmin`` expects from an objective.
    """
    print("Training with params: ")
    print(params)
    # Work on a copy so the caller's dict keeps its 'n_estimators' entry.
    booster_params = dict(params)
    # hp.quniform yields floats; xgb.train needs an integer round count.
    num_round = int(booster_params.pop('n_estimators'))
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dvalid = xgb.DMatrix(test_x, label=test_y)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(booster_params, dtrain, num_round,
                          evals=watchlist,
                          verbose_eval=None)
    # No early stopping is configured, so the full model is the one to score.
    # The former `ntree_limit=best_iteration + 1` selected the same trees, but
    # `ntree_limit` no longer exists in XGBoost 2.x and `best_iteration` is
    # only defined there when early stopping ran.
    predictions = gbm_model.predict(dvalid)
    auc = roc_auc_score(test_y, predictions)
    # TODO: Add the importance for the selected features
    print('-------------------------------------------')
    print("\tROC-AUC-SCORE {0}".format(auc))
    print('-------------------------------------------')
    print('\n')
    # fmin minimizes, so return the loss (1 - AUC) rather than the AUC.
    loss = 1 - auc
    return {'loss': loss, 'status': STATUS_OK}

def optimize(random_state=101, max_evals=2, trials=None):
    """Search the XGBoost hyperparameter space with hyperopt's TPE algorithm.

    Parameters
    ----------
    random_state : int
        Passed to XGBoost as ``seed``. ``fmin`` is called without ``rstate``,
        so the sequence of sampled trials is not controlled by this value.
    max_evals : int
        Number of configurations to evaluate. The default of 2 is only a
        smoke test; raise it for a real search.
    trials : hyperopt.Trials or None
        Optional trials object to record every evaluation in. ``None`` lets
        ``fmin`` create its own.

    Returns
    -------
    dict
        The best value found for each searched hyperparameter, as returned by
        ``fmin``, with ``max_depth`` decoded to the actual tree depth.
    """
    # To learn more about XGBoost parameters, head to this page:
    # https://xgboost.readthedocs.io/en/latest/parameter.html

    # Kept in a variable because fmin reports the *index* of the option that
    # hp.choice picked, and the list is needed to turn it back into a depth.
    depth_choices = np.arange(1, 14, dtype=int)
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # hp.quniform would hand max_depth over as a float, so hp.choice over
        # integer options is used instead.
        'max_depth':  hp.choice('max_depth', depth_choices),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it
        # and it will default to the maximum number.
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        # NOTE(review): newer XGBoost releases use 'verbosity' instead of
        # 'silent' - confirm against the installed version.
        'silent': 1,
        'seed': random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest, max_evals=max_evals,
                trials=trials)
    # Decode the hp.choice index into the real depth (index 0 -> depth 1).
    if best and 'max_depth' in best:
        best['max_depth'] = int(depth_choices[int(best['max_depth'])])
    return best



# Run the search, then print the winning configuration under a banner.
best_hyperparams = optimize()
print(
    '\n\n----------------------------------------The best hyperparameters-----------------------------------------------\n',
    best_hyperparams,
    sep='\n',
)

Training with params: 
{'booster': 'gbtree', 'colsample_bytree': 0.75, 'eta': 0.42500000000000004, 'eval_metric': 'auc', 'gamma': 0.65, 'max_depth': 5, 'min_child_weight': 2.0, 'n_estimators': 860.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 101, 'silent': 1, 'subsample': 0.55, 'tree_method': 'exact'}
-------------------------------------------
	ROC-AUC-SCORE 0.7843399482312339
-------------------------------------------


Training with params: 
{'booster': 'gbtree', 'colsample_bytree': 0.8, 'eta': 0.025, 'eval_metric': 'auc', 'gamma': 0.7000000000000001, 'max_depth': 6, 'min_child_weight': 6.0, 'n_estimators': 728.0, 'nthread': 4, 'objective': 'binary:logistic', 'seed': 101, 'silent': 1, 'subsample': 0.9, 'tree_method': 'exact'}
-------------------------------------------
	ROC-AUC-SCORE 0.8263589301121658
-------------------------------------------




----------------------------------------The best hyperparameters-----------------------------------------------

{'colsamp