# Problem Statement

## In this assignment students need to predict whether a person makes over 50K per year or not from classic adult dataset using XGBoost.

In [8]:
import numpy as np
import pandas as pd
train_set =pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)

test_set =pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', skiprows = 1, header = None)
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status','occupation','relationship', 
              'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week','native_country', 'wage_class']

train_set.columns = col_labels
test_set.columns = col_labels

In [9]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [10]:
train_set.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
wage_class        0
dtype: int64

In [11]:
train_set.shape, test_set.shape

((32561, 15), (16281, 15))

In [12]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [13]:
train_set.wage_class.unique()

array([' <=50K', ' >50K'], dtype=object)

In [14]:
train_set.wage_class = train_set.wage_class.map({' >50K':1,' <=50K':0})

In [15]:
test_set.wage_class.unique()

array([' <=50K.', ' >50K.'], dtype=object)

In [16]:
test_set.wage_class = test_set.wage_class.map({' >50K.':1,' <=50K.':0})

In [17]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [18]:
cat_variables = train_set.dtypes[train_set.dtypes == 'object'].index

In [19]:
train_set[cat_variables].nunique()

workclass          9
education         16
marital_status     7
occupation        15
relationship       6
race               5
sex                2
native_country    42
dtype: int64

In [20]:
train = train_set.copy()
test = test_set.copy()

In [21]:
for column in cat_variables:
    freq = train[column].value_counts()/train.shape[0]
    categories_to_combine = freq.loc[freq.values <0.05].index
    
    for cat in categories_to_combine:
        train[column].replace({cat:'Others'},inplace=True)
        test[column].replace({cat:'Others'},inplace=True)

In [22]:
train[cat_variables].nunique()

workclass         5
education         5
marital_status    4
occupation        9
relationship      5
race              3
sex               2
native_country    2
dtype: int64

## Data transformations

In [23]:
data = train.append(test)
data.shape

(48842, 15)

In [24]:
cat_variables

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country'],
      dtype='object')

In [25]:
# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for var in cat_variables:
    train[var] = le.fit_transform(train[var])
    test[var] = le.fit_transform(test[var])

In [27]:
train.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,39,4,77516,0,13,2,1,1,1,1,2174,0,40,0,0
1,50,3,83311,0,13,1,3,0,1,1,0,0,13,0,0
2,38,2,215646,1,9,0,8,1,1,1,0,0,40,0,0
3,53,2,234721,4,7,1,8,0,0,1,0,0,40,0,0
4,28,2,338409,0,13,1,6,4,0,0,0,0,40,1,0
5,37,2,284582,2,14,1,3,4,1,0,0,0,40,0,0
6,49,2,160187,4,5,3,5,1,0,0,0,0,16,1,0
7,52,3,209642,1,9,1,3,0,1,1,0,0,45,0,1
8,31,2,45781,2,14,2,6,1,1,0,14084,0,50,0,1
9,42,2,159449,0,13,1,3,0,1,1,5178,0,40,0,1


In [28]:
test.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,2,226802,4,7,2,4,2,0,1,0,0,40,0,0
1,38,2,89814,1,9,1,8,0,1,1,0,0,50,0,0
2,28,1,336951,4,12,1,8,0,1,1,0,0,40,0,1
3,44,2,160323,3,10,1,4,0,0,1,7688,0,40,0,1
4,18,0,103497,3,10,2,0,2,1,0,0,0,30,0,0
5,34,2,198693,4,6,2,5,1,1,1,0,0,30,0,0
6,29,0,227026,1,9,2,0,3,0,1,0,0,40,0,0
7,63,3,104626,4,15,1,6,0,1,1,3103,0,32,0,1
8,24,2,369667,3,10,2,5,3,1,0,0,0,40,0,0
9,55,2,104996,4,4,1,2,0,1,1,0,0,10,0,0


In [29]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV, train_test_split

In [30]:
import xgboost as xgb
from sklearn import metrics
target = 'wage_class'
predictors = [x for x in train.columns if x not in [target]]
dtrain = xgb.DMatrix(train[predictors],label=train[target])
dtest = xgb.DMatrix(test[predictors],label = test[target])

In [31]:
# Basic model
params = {
    'objective':'binary:logistic',
    'max_depth':2,
    'silent':1,
    'eta':1
}

num_rounds = 48 # 48

In [32]:
watchlist  = [(dtrain,'train'),(dtest,'test')] # native interface only
bst = xgb.train(params, dtrain, num_rounds, watchlist)

[0]	train-error:0.173766	test-error:0.173208
[1]	train-error:0.159977	test-error:0.16074
[2]	train-error:0.153404	test-error:0.155212
[3]	train-error:0.152821	test-error:0.154229
[4]	train-error:0.152544	test-error:0.153922
[5]	train-error:0.149842	test-error:0.152386
[6]	train-error:0.149658	test-error:0.151403
[7]	train-error:0.148491	test-error:0.149622
[8]	train-error:0.147661	test-error:0.148885
[9]	train-error:0.147078	test-error:0.148271
[10]	train-error:0.145726	test-error:0.145998
[11]	train-error:0.145174	test-error:0.145753
[12]	train-error:0.144621	test-error:0.143787
[13]	train-error:0.142256	test-error:0.142129
[14]	train-error:0.141058	test-error:0.140593
[15]	train-error:0.140291	test-error:0.140593
[16]	train-error:0.140198	test-error:0.140225
[17]	train-error:0.138816	test-error:0.138874
[18]	train-error:0.138509	test-error:0.137645
[19]	train-error:0.137004	test-error:0.136171
[20]	train-error:0.134885	test-error:0.13439
[21]	train-error:0.135254	test-error:0.13439
[

### Evaluate the model

In [33]:
preds_prob = bst.predict(dtest)
preds_prob

array([0.0038235 , 0.23264956, 0.2415131 , ..., 0.83253616, 0.11436808,
       0.8048834 ], dtype=float32)

In [34]:
labels = dtest.get_label()
preds = preds_prob > 0.5 # threshold
correct = 0

for i in range(len(preds)):
    if (labels[i] == preds[i]):
        correct += 1

print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-correct/len(preds)))

Predicted correctly: 14181/16281
Error: 0.1290


### Evaluation of the model

In [35]:
# specify general training parameters
params = {
    'objective':'binary:logistic',
    'max_depth':1,
    'silent':1,
    'eta':0.5
}

num_rounds = 5

In [36]:
watchlist  = [(dtest,'test'), (dtrain,'train')]

In [37]:
bst = xgb.train(params, dtrain, num_rounds, watchlist)

[0]	test-error:0.236226	train-error:0.24081
[1]	test-error:0.195074	train-error:0.197383
[2]	test-error:0.198759	train-error:0.201437
[3]	test-error:0.196487	train-error:0.198643
[4]	test-error:0.163074	train-error:0.162556


In [38]:
params['eval_metric'] = 'logloss'
bst = xgb.train(params, dtrain, num_rounds, watchlist)

[0]	test-logloss:0.543497	train-logloss:0.547068
[1]	test-logloss:0.477086	train-logloss:0.480536
[2]	test-logloss:0.440804	train-logloss:0.443902
[3]	test-logloss:0.41467	train-logloss:0.418023
[4]	test-logloss:0.401666	train-logloss:0.404755


In [39]:
params['eval_metric'] = ['logloss', 'auc']
bst = xgb.train(params, dtrain, num_rounds, watchlist)

[0]	test-logloss:0.543497	test-auc:0.736104	train-logloss:0.547068	train-auc:0.730227
[1]	test-logloss:0.477086	test-auc:0.787733	train-logloss:0.480536	train-auc:0.78378
[2]	test-logloss:0.440804	test-auc:0.839844	train-logloss:0.443902	train-auc:0.835836
[3]	test-logloss:0.41467	test-auc:0.851247	train-logloss:0.418023	train-auc:0.849912
[4]	test-logloss:0.401666	test-auc:0.860932	train-logloss:0.404755	train-auc:0.861053


## Simple definition to build a XGB model

In [40]:
dev_X, val_X, dev_y, val_y = train_test_split(train[predictors], train[target], test_size = 0.3, random_state = 42)

In [41]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    dtest1 = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest1, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [42]:
# Training XGB
pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, test[predictors])
print("XGB Training Completed...")

[0]	train-rmse:0.499718	valid-rmse:0.499739
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.472644	valid-rmse:0.473818
[200]	train-rmse:0.448956	valid-rmse:0.451484
[300]	train-rmse:0.427888	valid-rmse:0.431819
[400]	train-rmse:0.409261	valid-rmse:0.414627
[500]	train-rmse:0.392691	valid-rmse:0.399424
[600]	train-rmse:0.378099	valid-rmse:0.386177
[700]	train-rmse:0.365283	valid-rmse:0.374794
[800]	train-rmse:0.354065	valid-rmse:0.365096
[900]	train-rmse:0.344124	valid-rmse:0.35651
[1000]	train-rmse:0.33533	valid-rmse:0.349162
[1100]	train-rmse:0.327648	valid-rmse:0.342881
[1200]	train-rmse:0.32086	valid-rmse:0.337399
[1300]	train-rmse:0.314802	valid-rmse:0.33273
[1400]	train-rmse:0.309405	valid-rmse:0.328581
[1500]	train-rmse:0.304667	valid-rmse:0.325085
[1600]	train-rmse:0.300417	valid-rmse:0.322088
[1700]	train-rmse:0.29665	valid-rmse:0.319548
[1800]	train-rmse:0.29325

## Parameter tuning...

In [43]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], 
                          nfold=cv_folds,metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
                    
'''    feat_imp = pd.DataFrame(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')'''

"    feat_imp = pd.DataFrame(alg.booster().get_fscore()).sort_values(ascending=False)\n    feat_imp.plot(kind='bar', title='Feature Importances')\n    plt.ylabel('Feature Importance Score')"

Step 1: Fix learning rate and number of estimators for tuning tree-based parameters
In order to decide on boosting parameters, we need to set some initial values of other parameters. Lets take the following values:

 max_depth = 5 : This should be between 3-10. I’ve started with 5 but you can choose a different number as well. 4-6 can be good starting points.
 min_child_weight = 1 : A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
gamma = 0 : A smaller value like 0.1-0.2 can also be chosen for starting. This will anyways be tuned later.
 subsample, colsample_bytree = 0.8 : This is a commonly used used start value. Typical values range between 0.5-0.9.
scale_pos_weight = 1: Because of high class imbalance.

Please note that all the above are just initial estimates and will be tuned later. Lets take the default learning rate of 0.1 here and check the optimum number of trees using cv function of xgboost. The function defined above will do it for us.

In [44]:
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, train, predictors)


Model Report
Accuracy : 0.885
AUC Score (Train): 0.941992


#### Step 2: Tune max_depth and min_child_weight

We tune these first as they will have the highest impact on model outcome. To start with, let’s set wider ranges and then we will perform another iteration for smaller ranges.

Important Note: I’ll be doing some heavy-duty grid searched in this section which can take 15-30 mins or even more time to run depending on your system. You can vary the number of values you are testing based on what your system can handle.


In [45]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}

gsearch1 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
                                                      min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
                        param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.92060, std: 0.00270, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.92062, std: 0.00259, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.92015, std: 0.00289, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.92537, std: 0.00217, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.92512, std: 0.00204, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.92434, std: 0.00200, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.92523, std: 0.00219, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.92484, std: 0.00229, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.92425, std: 0.00175, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.92294, std: 0.00182, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.92342, std: 0.00145, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.92307, std: 0.00178, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 5, 'min_child_weight': 1

Here, we have run 12 combinations with wider intervals between values. The ideal values are 5 for max_depth and 5 for min_child_weight. Lets go one step deeper and look for optimum values. We’ll search for values 1 above and below the optimum values because we took an interval of two.

In [48]:
param_test2 = {
 'max_depth':[6,7,8],
 'min_child_weight':[0,1,2]
}
gsearch2 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
                                                      min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                      objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch2.fit(train[predictors],train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.92594, std: 0.00166, params: {'max_depth': 6, 'min_child_weight': 0},
  mean: 0.92593, std: 0.00192, params: {'max_depth': 6, 'min_child_weight': 1},
  mean: 0.92585, std: 0.00179, params: {'max_depth': 6, 'min_child_weight': 2},
  mean: 0.92521, std: 0.00178, params: {'max_depth': 7, 'min_child_weight': 0},
  mean: 0.92523, std: 0.00219, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.92517, std: 0.00196, params: {'max_depth': 7, 'min_child_weight': 2},
  mean: 0.92354, std: 0.00159, params: {'max_depth': 8, 'min_child_weight': 0},
  mean: 0.92410, std: 0.00161, params: {'max_depth': 8, 'min_child_weight': 1},
  mean: 0.92444, std: 0.00199, params: {'max_depth': 8, 'min_child_weight': 2}],
 {'max_depth': 6, 'min_child_weight': 0},
 0.9259398507387264)

In [50]:
param_test2b = {'min_child_weight':[0,1,2,4,5,6]}

gsearch2b = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=200, max_depth=6,
                                                   min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                   objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                         param_grid = param_test2b, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch2b.fit(train[predictors],train[target])

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=2, missing=None, n_estimators=200,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27, silent=True,
       subsample=0.8),
       fit_params=None, iid=False, n_jobs=4,
       param_grid={'min_child_weight': [0, 1, 2, 4, 5, 6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [51]:
modelfit(gsearch2b.best_estimator_, train, predictors)

gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_


Model Report
Accuracy : 0.8866
AUC Score (Train): 0.944050


([mean: 0.92516, std: 0.00148, params: {'min_child_weight': 0},
  mean: 0.92531, std: 0.00166, params: {'min_child_weight': 1},
  mean: 0.92532, std: 0.00147, params: {'min_child_weight': 2},
  mean: 0.92485, std: 0.00165, params: {'min_child_weight': 4},
  mean: 0.92439, std: 0.00183, params: {'min_child_weight': 5},
  mean: 0.92394, std: 0.00196, params: {'min_child_weight': 6}],
 {'min_child_weight': 2},
 0.9253196379190302)

#### Step 3: Tune gamma

Now lets tune gamma value using the parameters already tuned above. Gamma can take various values but I’ll check for 5 values here. You can go into more precise values as.

In [52]:
param_test3 = {'gamma':[i/10.0 for i in range(0,5)]}

gsearch3 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=6,
                                                  min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.92532, std: 0.00147, params: {'gamma': 0.0},
  mean: 0.92547, std: 0.00154, params: {'gamma': 0.1},
  mean: 0.92513, std: 0.00153, params: {'gamma': 0.2},
  mean: 0.92540, std: 0.00143, params: {'gamma': 0.3},
  mean: 0.92535, std: 0.00144, params: {'gamma': 0.4}],
 {'gamma': 0.1},
 0.9254738340399452)

In [53]:
xgb2 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=6,
 min_child_weight=2,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb2, train, predictors)


Model Report
Accuracy : 0.8866
AUC Score (Train): 0.944050


#### Step 4: Tune subsample and colsample_bytree

The next step would be try different subsample and colsample_bytree values. 
Lets do this in 2 stages as well and take values 0.6,0.7,0.8,0.9 for both to start with.

In [54]:
param_test4 = {'subsample':[i/10.0 for i in range(6,10)],
               'colsample_bytree':[i/10.0 for i in range(6,10)]}

gsearch4 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=200, max_depth=6,
                                                  min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch4.fit(train[predictors],train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.92505, std: 0.00171, params: {'colsample_bytree': 0.6, 'subsample': 0.6},
  mean: 0.92504, std: 0.00202, params: {'colsample_bytree': 0.6, 'subsample': 0.7},
  mean: 0.92577, std: 0.00201, params: {'colsample_bytree': 0.6, 'subsample': 0.8},
  mean: 0.92632, std: 0.00209, params: {'colsample_bytree': 0.6, 'subsample': 0.9},
  mean: 0.92443, std: 0.00163, params: {'colsample_bytree': 0.7, 'subsample': 0.6},
  mean: 0.92487, std: 0.00180, params: {'colsample_bytree': 0.7, 'subsample': 0.7},
  mean: 0.92532, std: 0.00174, params: {'colsample_bytree': 0.7, 'subsample': 0.8},
  mean: 0.92604, std: 0.00165, params: {'colsample_bytree': 0.7, 'subsample': 0.9},
  mean: 0.92427, std: 0.00149, params: {'colsample_bytree': 0.8, 'subsample': 0.6},
  mean: 0.92454, std: 0.00165, params: {'colsample_bytree': 0.8, 'subsample': 0.7},
  mean: 0.92532, std: 0.00147, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.92597, std: 0.00197, params: {'colsample_bytree': 0.8, 'subsample'

In [55]:
param_test5 = {'subsample':[i/100.0 for i in range(75,90,5)],
               'colsample_bytree':[i/100.0 for i in range(75,90,5)]}

gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                                  min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch5.fit(train[predictors],train[target])
gsearch5.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.92415, std: 0.00179, params: {'colsample_bytree': 0.75, 'subsample': 0.75},
  mean: 0.92436, std: 0.00208, params: {'colsample_bytree': 0.75, 'subsample': 0.8},
  mean: 0.92477, std: 0.00202, params: {'colsample_bytree': 0.75, 'subsample': 0.85},
  mean: 0.92410, std: 0.00175, params: {'colsample_bytree': 0.8, 'subsample': 0.75},
  mean: 0.92421, std: 0.00206, params: {'colsample_bytree': 0.8, 'subsample': 0.8},
  mean: 0.92457, std: 0.00193, params: {'colsample_bytree': 0.8, 'subsample': 0.85},
  mean: 0.92410, std: 0.00175, params: {'colsample_bytree': 0.85, 'subsample': 0.75},
  mean: 0.92421, std: 0.00206, params: {'colsample_bytree': 0.85, 'subsample': 0.8},
  mean: 0.92457, std: 0.00193, params: {'colsample_bytree': 0.85, 'subsample': 0.85}],
 {'colsample_bytree': 0.6, 'subsample': 0.9},
 0.9263171613277853)

### Step 5: Tuning Regularization Parameters

Next step is to apply regularization to reduce overfitting. Though many people don’t use this parameters much as gamma provides a substantial way of controlling complexity. But we should always try it. I’ll tune ‘reg_alpha’ value here and leave it upto you to try different values of ‘reg_lambda’.

In [56]:
param_test6 = {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}

gsearch6 = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=6,
                                                  min_child_weight=2, gamma=0, subsample=0.9, colsample_bytree=0.6,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch6.fit(train[predictors],train[target])
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.92637, std: 0.00210, params: {'reg_alpha': 1e-05},
  mean: 0.92623, std: 0.00208, params: {'reg_alpha': 0.01},
  mean: 0.92622, std: 0.00183, params: {'reg_alpha': 0.1},
  mean: 0.92582, std: 0.00205, params: {'reg_alpha': 1},
  mean: 0.91064, std: 0.00340, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.9263707904695095)

In [57]:
param_test7 = {'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]}

gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=6,
                                                  min_child_weight=2, gamma=0, subsample=0.9, colsample_bytree=0.6,
                                                  objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                        param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)

gsearch7.fit(train[predictors],train[target])
gsearch7.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.92637, std: 0.00210, params: {'reg_alpha': 1e-05},
  mean: 0.92623, std: 0.00208, params: {'reg_alpha': 0.01},
  mean: 0.92622, std: 0.00183, params: {'reg_alpha': 0.1},
  mean: 0.92582, std: 0.00205, params: {'reg_alpha': 1},
  mean: 0.91064, std: 0.00340, params: {'reg_alpha': 100}],
 {'reg_alpha': 1e-05},
 0.9263707904695095)

In [58]:
xgb3 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=6,
 min_child_weight=2,
 gamma=0,
 subsample=0.9,
 colsample_bytree=0.6,
 reg_alpha=0.1,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb3, train, predictors)


Model Report
Accuracy : 0.8869
AUC Score (Train): 0.944948


### Step 6: Reducing Learning Rate

Lastly, we should lower the learning rate and add more trees. Lets use the cv function of XGBoost to do the job again.

In [59]:
xgb4 = xgb.XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=4,
 min_child_weight=6,
 gamma=0,
 subsample=0.9,
 colsample_bytree=0.6,
 reg_alpha=0.1,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(xgb4, train, predictors)


Model Report
Accuracy : 0.8818
AUC Score (Train): 0.938659
