# Example

# 1. Hyperparameter Tuning

## 1.1 RandomSearch Tuning

In [1]:
def random_tuning(data_train, data_test, predictors, target_label, params_path):
    '''
    Definition : Run a randomized hyperparameter search (RandomizedSearchCV) for a binary
                 XGBoost classifier and write the best parameter set to a text file.

    Input :
    1.data_train    : training data, indexed with `predictors` and `target_label`
    2.data_test     : data handed to fit() as the evaluation set (`eval_set`)
    3.predictors    : list of predictor column names
    4.target_label  : column name of the target variable, usually a string
    5.params_path   : path of the file the best parameters are written to (overwritten)

    Output :
    1.best_score    : `best_score_` of the fitted search
    2.best_params   : `best_params_` of the fitted search

    Notes :
    - The target column is compared with 0 and 1 to derive the class ratio; if no
      training row has target == 1 the division raises ZeroDivisionError.
    - No `scoring` argument is passed to RandomizedSearchCV, so candidates are ranked
      by the estimator's default score; 'auc' below is only the eval_metric forwarded
      to fit() together with the early-stopping settings.
    '''

    # Candidate values sampled by the search:
    #   learning_rate 0.01..0.20, n_estimators 100..200 (step 10), max_depth 1..10,
    #   gamma 0.1..0.9, reg_alpha 0.000..0.099, reg_lambda 0.0..9.9 (step 0.1)
    param_tuning = {
        'learning_rate': [i/100.0 for i in range(1,21,1)],
        'n_estimators': range(100, 210, 10),
        'max_depth': range(1, 11, 1),
        'gamma': [i/10.0 for i in range(1,10,1)],
        'reg_alpha': [i/1000.0 for i in range(0,100,1)],
        'reg_lambda': [i/100.0 for i in range(0,1000,10)]
    }

    # Extra keyword arguments forwarded unchanged to every fit() call of the search.
    fit_params = {
        'eval_metric': 'auc',
        'early_stopping_rounds': 100,
        'eval_set': [(data_test[predictors], data_test[target_label])],
    }

    # Weight the positive class by the negative/positive ratio of the training data.
    target = data_train[target_label]
    n_negative = len(data_train[target == 0])
    n_positive = len(data_train[target == 1])

    ## RANDOMIZED PARAMETER TUNING ##
    XGB_tune = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=(n_negative/n_positive))
    random_search = RandomizedSearchCV(XGB_tune, param_tuning, n_iter=50, n_jobs=4, cv=10, verbose=2, refit=True, random_state=24)

    print("Randomized Search...")
    search_time_start = time.time()
    random_search.fit(data_train[predictors], data_train[target_label], **fit_params)
    print("Randomized Search Time: ", time.time() - search_time_start)

    best_score = random_search.best_score_
    best_params = random_search.best_params_

    ## SAVE BEST PARAMETERS INTO FILE
    # The context manager closes the file even when the write fails.
    with open(params_path, "w", encoding="utf-8") as save_params:
        save_params.write(str(best_params))

    return best_score, best_params

## 1.2 GridSearch Tuning

In [2]:
def grid_tuning(data_train, data_test, predictors, target_label, params_path):
    '''
    Definition : Run an exhaustive hyperparameter search (GridSearchCV, scored by ROC AUC)
                 for a binary XGBoost classifier and write the best parameter set to a
                 text file.

    Input :
    1.data_train    : training data, indexed with `predictors` and `target_label`
    2.data_test     : not used by this function; kept in the signature
    3.predictors    : list of predictor column names
    4.target_label  : column name of the target variable, usually a string
    5.params_path   : path of the file the best parameters are written to (overwritten)

    Output :
    1.best_score    : `best_score_` of the fitted search (mean cross-validated ROC AUC)
    2.best_params   : `best_params_` of the fitted search

    Notes :
    - The target column is compared with 0 and 1 to derive the class ratio; if no
      training row has target == 1 the division raises ZeroDivisionError.
    - The grid holds 10 x 6 x 5 x 100 = 30,000 candidates, each fitted on 5 folds.
    '''

    # Full grid: learning_rate 0.01..0.10, n_estimators 100..200 (step 20),
    # max_depth 1,3,5,7,9, gamma 0.00..0.99
    param_tuning = {
        'learning_rate': [i/100.0 for i in range(1,11,1)],
        'n_estimators': range(100, 210, 20),
        'max_depth': range(1, 11, 2),
        'gamma': [i/100.0 for i in range(0,100,1)]
    }

    # Weight the positive class by the negative/positive ratio of the training data.
    target = data_train[target_label]
    n_negative = len(data_train[target == 0])
    n_positive = len(data_train[target == 1])

    ## GRID PARAMETER TUNING ##
    # learning_rate, n_estimators and max_depth given here are also grid keys, so the
    # search replaces them for every candidate it fits.
    base_estimator = xgb.XGBClassifier(
        learning_rate=0.1,
        n_estimators=160,
        max_depth=5,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=(n_negative/n_positive),
        seed=25
    )
    grid_search = GridSearchCV(
        estimator=base_estimator,
        param_grid=param_tuning, scoring='roc_auc', n_jobs=4, cv=5
    )

    grid_search.fit(data_train[predictors], data_train[target_label])
    best_score = grid_search.best_score_
    best_params = grid_search.best_params_

    ## SAVE BEST PARAMETERS INTO FILE
    # The context manager closes the file even when the write fails.
    with open(params_path, "w", encoding="utf-8") as save_params:
        save_params.write(str(best_params))

    return best_score, best_params

## 1.3 Bayesian Optimization

In [None]:
# Wrap the training predictors and target in XGBoost's DMatrix container so
# they can be passed to xgb.cv.
# Reference: https://xgboost.readthedocs.io/en/latest/python/python_intro.html
dtrain = xgb.DMatrix(
    data=data_train[predictors],
    label=data_train[target_label],
    feature_names=predictors,
)

def hyp_xgb(max_depth, subsample, colsample_bytree, min_child_weight, gamma, learning_rate,
            max_delta_step, reg_alpha, reg_lambda, scale_pos_weight):
    '''
    Definition : Objective function for the Bayesian optimizer. Builds an XGBoost
                 parameter dict from the proposed values, runs 5-fold cross-validation
                 on the module-level `dtrain`, and returns the resulting test AUC.

    Input :
    1.max_depth          : rounded to the nearest integer
    2.subsample          : clamped to the interval [0, 1]
    3.colsample_bytree   : clamped to the interval [0, 1]
    4.min_child_weight   : truncated to an integer with int()
    5.gamma              : negative values are replaced by 0
    6.learning_rate      : passed through unchanged
    7.max_delta_step     : truncated to an integer with int()
    8.reg_alpha          : passed through unchanged
    9.reg_lambda         : passed through unchanged
    10.scale_pos_weight  : passed through unchanged

    Output :
    1.score : value of the 'test-auc-mean' column in the last row returned by xgb.cv
    '''

    def clamp_unit(value):
        # Keep a fraction-type parameter inside [0, 1].
        return max(min(value, 1), 0)

    booster_params = {
        # Fixed settings shared by every evaluation.
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': -1,
        # Values proposed by the optimizer, coerced where an integer or a
        # bounded value is needed.
        'max_depth': int(round(max_depth)),
        'subsample': clamp_unit(subsample),
        'colsample_bytree': clamp_unit(colsample_bytree),
        'min_child_weight': int(min_child_weight),
        'gamma': max(gamma, 0),
        'learning_rate': learning_rate,
        'max_delta_step': int(max_delta_step),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'scale_pos_weight': scale_pos_weight,
    }

    # Up to 500 boosting rounds, stopping after 10 rounds without improvement.
    cv_history = xgb.cv(
        booster_params,
        dtrain,
        num_boost_round=500,
        nfold=5,
        early_stopping_rounds=10,
        verbose_eval=False,
    )
    mean_test_auc = cv_history['test-auc-mean']
    return mean_test_auc.iloc[-1]

In [None]:
# Search bounds handed to the Bayesian optimizer: one (lower, upper) pair for
# each argument of hyp_xgb.
pds = dict(
    min_child_weight=(3, 20),
    gamma=(0, 10),
    subsample=(0.5, 1),
    colsample_bytree=(0.1, 1),
    max_depth=(2, 15),
    learning_rate=(0.01, 0.5),
    max_delta_step=(1, 30),
    reg_alpha=(0.01, 0.4),
    reg_lambda=(0.01, 0.4),
    scale_pos_weight=(0.1, 20),
)

In [None]:
# Maximize hyp_xgb over the bounds in pds: 4 initial points (init_points)
# followed by 25 optimization iterations (n_iter).
optimizer = BayesianOptimization(hyp_xgb, pds, random_state=1)
optimizer.maximize(init_points=4, n_iter=25)

# Parameter set of the best evaluation found. These are the raw values the
# optimizer proposed; hyp_xgb rounds or clamps some of them before training.
best_params_bay = optimizer.max['params']

# The output file name is prefixed with today's date as YYYYMMDD.
best_params_path_bay = "%s_best_params_tuning_bayesian.txt" % date.today().strftime("%Y%m%d")

## SAVE BEST PARAMETERS INTO FILE
# The context manager closes the file even when the write fails.
with open(best_params_path_bay, "w", encoding="utf-8") as save_params:
    save_params.write(str(best_params_bay))

In [None]:
#