# Parameter Tuning using XGBoost

## Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV 
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

## Importing the dataset

In [5]:
dataset = pd.read_csv('data.csv')
X_train = dataset.iloc[:4751, [0,1]]
y_train = dataset.iloc[:4751, 2]
X_test = dataset.iloc[4751:, [0,1]]
y_test = dataset.iloc[4751:, 2]
print(len(X_train))
print(len(X_test))

4751
1584


## Function : Create XGBoost models and perform cross-validation

In [6]:
def modelfit(alg, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(X_train, y_train,eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_train)
    dtrain_predprob = alg.predict_proba(X_train)[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Training Accuracy : %.4g" % metrics.accuracy_score(y_train.values, dtrain_predictions))
    print ("Confusion Matrix for Train : ",metrics.confusion_matrix(y_train.values.tolist(), dtrain_predictions.tolist()))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, dtrain_predprob))
                    
    dtest_predictions = alg.predict(X_test)
    dtest_predprob = alg.predict_proba(X_test)[:,1]
    print ("Test Accuracy : %.4g" % metrics.accuracy_score(y_test.values, dtest_predictions))
    print ("Confusion Matrix for Test : ",metrics.confusion_matrix(y_test.values.tolist(), dtest_predictions.tolist()))
    print ("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, dtest_predprob))

## Find the number of estimators for a high learning rate = 0.1

In [7]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1)


Model Report
Training Accuracy : 0.8918
Confusion Matrix for Train :  [[2097  278]
 [ 236 2140]]
AUC Score (Train): 0.962408
Test Accuracy : 0.5019
Confusion Matrix for Test :  [[795   0]
 [789   0]]
AUC Score (Test): 0.499837


  if diff:
  if diff:


# Tune parameters :
      - min_child_weight [default=1]
Defines the minimum sum of weights of all observations required in a child.
This is similar to min_child_leaf in GBM but not exactly. This refers to min “sum of weights” of observations while GBM has min “number of observations”.
Used to control over-fitting. Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
Too high values can lead to under-fitting hence, it should be tuned using CV.
      - max_depth [default=6]
The maximum depth of a tree, same as GBM.
Used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample.
Should be tuned using CV.
Typical values: 3-10

In [8]:
param_test1 = {
    'max_depth':[3,5,7,9],
    'min_child_weight':[1,3,5]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
                                        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
                       param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=140,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

### Find the best values for these parameters

In [9]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.52774, std: 0.01210, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.54045, std: 0.01407, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.52816, std: 0.01841, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.54001, std: 0.02165, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.54137, std: 0.03514, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.55900, std: 0.05636, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.52744, std: 0.02917, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.53959, std: 0.02696, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.54945, std: 0.04618, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.53576, std: 0.02562, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.53831, std: 0.02415, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.55463, std: 0.04144, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_child_weight': 5

### Values found for
 - max_depth : 5
 - min_child_weight : 5
 
We’ll search for values 1 above and below the optimum values because we took an interval of two.

In [10]:
param_test2 = {
    'max_depth':[4,5,6],
    'min_child_weight':[4,5,6]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
                                        min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=2, missing=None, n_estimators=140,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'max_depth': [4, 5, 6], 'min_child_weight': [4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

### Find the best values for these parameters

In [11]:
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.54403, std: 0.04351, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: 0.54346, std: 0.04918, params: {'max_depth': 4, 'min_child_weight': 5},
  mean: 0.54749, std: 0.04520, params: {'max_depth': 4, 'min_child_weight': 6},
  mean: 0.53697, std: 0.02918, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: 0.55900, std: 0.05636, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.54870, std: 0.04701, params: {'max_depth': 5, 'min_child_weight': 6},
  mean: 0.54540, std: 0.04090, params: {'max_depth': 6, 'min_child_weight': 4},
  mean: 0.55273, std: 0.05221, params: {'max_depth': 6, 'min_child_weight': 5},
  mean: 0.55549, std: 0.04268, params: {'max_depth': 6, 'min_child_weight': 6}],
 {'max_depth': 5, 'min_child_weight': 5},
 0.5590018138224819)

## Tune parameter :
    gamma [default=0]
A node is split only when the resulting split gives a positive reduction in the loss function. Gamma specifies the minimum loss reduction required to make a split.
Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.

In [12]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=5, missing=None, n_estimators=140,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

### Find the best value for this parameter

In [13]:
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.55900, std: 0.05636, params: {'gamma': 0.0},
  mean: 0.55220, std: 0.04826, params: {'gamma': 0.1},
  mean: 0.54786, std: 0.05588, params: {'gamma': 0.2},
  mean: 0.55177, std: 0.05480, params: {'gamma': 0.3},
  mean: 0.55240, std: 0.05388, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.5590018138224819)

## Re-calibrate the number of boosting rounds for the updated parameters.

In [14]:
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=5,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb2)


Model Report
Training Accuracy : 0.8931
Confusion Matrix for Train :  [[2109  266]
 [ 242 2134]]
AUC Score (Train): 0.964310
Test Accuracy : 0.4981
Confusion Matrix for Test :  [[384 411]
 [384 405]]
AUC Score (Test): 0.507337


  if diff:
  if diff:


## Tune parameters :
 - subsample [default=1]
 
Same as the subsample of GBM. Denotes the fraction of observations to be randomly samples for each tree.
Lower values make the algorithm more conservative and prevents overfitting but too small values might lead to under-fitting.
Typical values: 0.5-1
 - colsample_bytree [default=1]
 
Similar to max_features in GBM. Denotes the fraction of columns to be randomly samples for each tree.
Typical values: 0.5-1

In [15]:
param_test4 = {
 'subsample':[i/10.0 for i in range(1,6)],
 'colsample_bytree':[i/10.0 for i in range(1,6)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=5, missing=None, n_estimators=177,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'subsample': [0.1, 0.2, 0.3, 0.4, 0.5], 'colsample_bytree': [0.1, 0.2, 0.3, 0.4, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

### Find the best values for these parameters

In [16]:
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.52957, std: 0.02041, params: {'colsample_bytree': 0.1, 'subsample': 0.1},
  mean: 0.52419, std: 0.02506, params: {'colsample_bytree': 0.1, 'subsample': 0.2},
  mean: 0.52938, std: 0.03721, params: {'colsample_bytree': 0.1, 'subsample': 0.3},
  mean: 0.53954, std: 0.03031, params: {'colsample_bytree': 0.1, 'subsample': 0.4},
  mean: 0.52878, std: 0.03035, params: {'colsample_bytree': 0.1, 'subsample': 0.5},
  mean: 0.52957, std: 0.02041, params: {'colsample_bytree': 0.2, 'subsample': 0.1},
  mean: 0.52419, std: 0.02506, params: {'colsample_bytree': 0.2, 'subsample': 0.2},
  mean: 0.52938, std: 0.03721, params: {'colsample_bytree': 0.2, 'subsample': 0.3},
  mean: 0.53954, std: 0.03031, params: {'colsample_bytree': 0.2, 'subsample': 0.4},
  mean: 0.52878, std: 0.03035, params: {'colsample_bytree': 0.2, 'subsample': 0.5},
  mean: 0.52957, std: 0.02041, params: {'colsample_bytree': 0.3, 'subsample': 0.1},
  mean: 0.52419, std: 0.02506, params: {'colsample_bytree': 0.3, 'subsample'

### Values found for :
 - colsample_bytree : 0.1
 
 - subsample : 0.4
 
We search again within smaller ranges of these values

In [17]:
param_test5 = {
 'subsample':[i/100.0 for i in range(31,49)],
 'colsample_bytree':[i/100.0 for i in range(1,15)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.53110, std: 0.04119, params: {'colsample_bytree': 0.01, 'subsample': 0.31},
  mean: 0.53116, std: 0.03836, params: {'colsample_bytree': 0.01, 'subsample': 0.32},
  mean: 0.54102, std: 0.03774, params: {'colsample_bytree': 0.01, 'subsample': 0.33},
  mean: 0.53241, std: 0.02749, params: {'colsample_bytree': 0.01, 'subsample': 0.34},
  mean: 0.52512, std: 0.03791, params: {'colsample_bytree': 0.01, 'subsample': 0.35},
  mean: 0.52654, std: 0.03452, params: {'colsample_bytree': 0.01, 'subsample': 0.36},
  mean: 0.54073, std: 0.03253, params: {'colsample_bytree': 0.01, 'subsample': 0.37},
  mean: 0.54609, std: 0.03266, params: {'colsample_bytree': 0.01, 'subsample': 0.38},
  mean: 0.54474, std: 0.04573, params: {'colsample_bytree': 0.01, 'subsample': 0.39},
  mean: 0.53954, std: 0.03031, params: {'colsample_bytree': 0.01, 'subsample': 0.4},
  mean: 0.53912, std: 0.03441, params: {'colsample_bytree': 0.01, 'subsample': 0.41},
  mean: 0.54979, std: 0.03874, params: {'colsample_bytr

### Values found for :
 - colsample_bytree : 0.01
 
 - subsample : 0.42
 
We search again within smaller ranges of colsample_bytree

In [26]:
param_test5b = {
 'colsample_bytree':[1e-4,1e-3,1e-2,1e-1,2e-1]
}
gsearch5b = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.42, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5b, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch5b.fit(X_train,y_train)
gsearch5b.grid_scores_, gsearch5b.best_params_, gsearch5b.best_score_

([mean: 0.54979, std: 0.03874, params: {'colsample_bytree': 0.0001},
  mean: 0.54979, std: 0.03874, params: {'colsample_bytree': 0.001},
  mean: 0.54979, std: 0.03874, params: {'colsample_bytree': 0.01},
  mean: 0.54979, std: 0.03874, params: {'colsample_bytree': 0.1},
  mean: 0.54979, std: 0.03874, params: {'colsample_bytree': 0.2}],
 {'colsample_bytree': 0.0001},
 0.5497949160827766)

## Tune regularization parameter :
  - alpha [default=0]
  
L1 regularization term on weight (analogous to Lasso regression)

Can be used in case of very high dimensionality so that the algorithm runs faster when implemented

In [19]:
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
                                        min_child_weight=5, gamma=0, subsample=0.42, colsample_bytree=0.01,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch6.fit(X_train,y_train)
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.54979, std: 0.03874, params: {'reg_alpha': 1e-05},
  mean: 0.54490, std: 0.03643, params: {'reg_alpha': 0.01},
  mean: 0.54655, std: 0.02780, params: {'reg_alpha': 0.1},
  mean: 0.54196, std: 0.03621, params: {'reg_alpha': 1},
  mean: 0.50000, std: 0.00000, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.5497949160827766)

### Value found for :
 - reg_alpha : 1e-5
 
We search again within smaller ranges of this parameter

In [20]:
param_test7 = {
 'reg_alpha':[0, 1e-6, 5e-6, 1e-5, 5e-5]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.01, colsample_bytree=0.42,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch7.fit(X_train,y_train)
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

([mean: 0.50208, std: 0.02488, params: {'reg_alpha': 0},
  mean: 0.50208, std: 0.02488, params: {'reg_alpha': 1e-06},
  mean: 0.50208, std: 0.02488, params: {'reg_alpha': 5e-06},
  mean: 0.50208, std: 0.02488, params: {'reg_alpha': 1e-05},
  mean: 0.50208, std: 0.02488, params: {'reg_alpha': 5e-05}],
 {'reg_alpha': 0},
 0.5020831024930748)

## Tune parameter :
 - reg_lambda [default=1]
 
L2 regularization term on weights (analogous to Ridge regression)

This used to handle the regularization part of XGBoost. Though many data scientists don’t use it often, it should be explored to reduce overfitting.

In [21]:
param_test8 = {
    'reg_lambda':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch8 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
                                        min_child_weight=5, gamma=0, subsample=0.42, colsample_bytree=0.01,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test8, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch8.fit(X_train,y_train)
gsearch8.grid_scores_, gsearch8.best_params_, gsearch8.best_score_

([mean: 0.53992, std: 0.03553, params: {'reg_lambda': 1e-05},
  mean: 0.55079, std: 0.04028, params: {'reg_lambda': 0.01},
  mean: 0.54042, std: 0.03770, params: {'reg_lambda': 0.1},
  mean: 0.54979, std: 0.03874, params: {'reg_lambda': 1},
  mean: 0.52032, std: 0.01014, params: {'reg_lambda': 100}],
 {'reg_lambda': 0.01},
 0.5507912139481832)

### Value found for :
 - reg_lambda : 1e-2
 
We search again within smaller ranges of this parameter

In [28]:
param_test8b = {
 'reg_lambda':[i/1000.0 for i in range(1,15)]
}
gsearch8b = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=5,
 min_child_weight=5, gamma=0, subsample=0.42, colsample_bytree=0.01,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test8b, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch8b.fit(X_train,y_train)
gsearch8b.grid_scores_, gsearch8b.best_params_, gsearch8b.best_score_

([mean: 0.53976, std: 0.03648, params: {'reg_lambda': 0.001},
  mean: 0.53992, std: 0.03464, params: {'reg_lambda': 0.002},
  mean: 0.53948, std: 0.03400, params: {'reg_lambda': 0.003},
  mean: 0.54078, std: 0.03185, params: {'reg_lambda': 0.004},
  mean: 0.54193, std: 0.03186, params: {'reg_lambda': 0.005},
  mean: 0.54170, std: 0.03185, params: {'reg_lambda': 0.006},
  mean: 0.53942, std: 0.03024, params: {'reg_lambda': 0.007},
  mean: 0.54457, std: 0.03241, params: {'reg_lambda': 0.008},
  mean: 0.54932, std: 0.03894, params: {'reg_lambda': 0.009},
  mean: 0.55079, std: 0.04028, params: {'reg_lambda': 0.01},
  mean: 0.55167, std: 0.03999, params: {'reg_lambda': 0.011},
  mean: 0.54996, std: 0.04045, params: {'reg_lambda': 0.012},
  mean: 0.54722, std: 0.04128, params: {'reg_lambda': 0.013},
  mean: 0.54832, std: 0.04178, params: {'reg_lambda': 0.014}],
 {'reg_lambda': 0.011},
 0.551665668195256)

## Apply this regularization in the model

In [30]:
xgb3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=5,
 gamma=0,
 subsample=0.42,
 colsample_bytree=0.01,
 reg_alpha=0,
 reg_lambda = 0.011,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb3)


Model Report
Training Accuracy : 0.8872
Confusion Matrix for Train :  [[2094  281]
 [ 255 2121]]
AUC Score (Train): 0.959033
Test Accuracy : 0.4924
Confusion Matrix for Test :  [[318 477]
 [327 462]]
AUC Score (Test): 0.508448


  if diff:
  if diff:


## Reducing Learning Rate to 0.01

In [31]:
xgb4 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=5,
 gamma=0,
 subsample=0.42,
 colsample_bytree=0.01,
 reg_alpha=0,
 reg_lambda = 0.011,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb4)


Model Report
Training Accuracy : 0.848
Confusion Matrix for Train :  [[1993  382]
 [ 340 2036]]
AUC Score (Train): 0.925536
Test Accuracy : 0.5025
Confusion Matrix for Test :  [[243 552]
 [236 553]]
AUC Score (Test): 0.510305


  if diff:
  if diff:


## Conclusion : 

### Training accuracy found : 84.8%

### Test accuracy found : 50.25%