Learn XGBoost with [A Guide to Gradient Boosted Trees with XGBoost in Python](https://jessesw.com/XG-Boost/)

In [140]:
import numpy as np
import pandas as pd

In [141]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
                      skiprows = 1, header = None) # Make sure to skip a row for the test set

In [142]:
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [143]:
test_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [144]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 
              'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
             'wage_class']

In [145]:
test_set.columns = col_labels
train_set.columns = col_labels

In [146]:
train_set.info() # note that Pandas will not take '?' as nan automatically, use na_values to specify it
# By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, 
# ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Remove rows containing unknown values (" ?")

In [147]:
train_set = train_set.replace(" ?", np.nan).dropna()
test_set = test_set.replace(" ?", np.nan).dropna()
print(train_set.shape, test_set.shape)

(30162, 15) (15060, 15)


In [148]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.


In [149]:
# note that the wage_class in the test set has an additional dot after its value
test_set['wage_class'] = test_set['wage_class'].replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
test_set['wage_class'].unique()

array([' <=50K', ' >50K'], dtype=object)

## Applying Ordinal Encoding to Categoricals
All called numeric encoding. That is, assign a unique number to each category. 

In [150]:
combine_set = pd.concat([train_set, test_set])
combine_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 16280
Data columns (total 15 columns):
age               45222 non-null int64
workclass         45222 non-null object
fnlwgt            45222 non-null int64
education         45222 non-null object
education_num     45222 non-null int64
marital_status    45222 non-null object
occupation        45222 non-null object
relationship      45222 non-null object
race              45222 non-null object
sex               45222 non-null object
capital_gain      45222 non-null int64
capital_loss      45222 non-null int64
hours_per_week    45222 non-null int64
native_country    45222 non-null object
wage_class        45222 non-null object
dtypes: int64(6), object(9)
memory usage: 5.5+ MB


In [151]:
# how to use numerical encoding?
pd.Categorical(['a', 'b', 'c', 'a']).codes # start from 0

array([0, 1, 2, 0], dtype=int8)

In [152]:
for feature in combine_set.columns:
    if combine_set[feature].dtype == 'object':
        combine_set[feature] = pd.Categorical(combine_set[feature]).codes

In [153]:
combine_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 16280
Data columns (total 15 columns):
age               45222 non-null int64
workclass         45222 non-null int8
fnlwgt            45222 non-null int64
education         45222 non-null int8
education_num     45222 non-null int64
marital_status    45222 non-null int8
occupation        45222 non-null int8
relationship      45222 non-null int8
race              45222 non-null int8
sex               45222 non-null int8
capital_gain      45222 non-null int64
capital_loss      45222 non-null int64
hours_per_week    45222 non-null int64
native_country    45222 non-null int8
wage_class        45222 non-null int8
dtypes: int64(6), int8(9)
memory usage: 2.8 MB


In [154]:
train_set_ne = combine_set[0:train_set.shape[0]]
test_set_ne = combine_set[train_set.shape[0]:]

## Apply one-hot encoding to the categorical variables

In [155]:
combine_set = pd.concat([train_set, test_set])
combine_set = pd.get_dummies(combine_set) # all columns of type object or catagory will be encoded

In [156]:
combine_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45222 entries, 0 to 16280
Columns: 106 entries, age to wage_class_ >50K
dtypes: int64(6), uint8(100)
memory usage: 6.7 MB


In [157]:
train_set_oe = combine_set[0:train_set.shape[0]]
test_set_oe = combine_set[train_set.shape[0]:]

## Initial Model Setup and Grid Search Cross-validation for model selection

In [158]:
train_y = train_set_ne.pop('wage_class')
test_y = test_set_ne.pop('wage_class')
train_x = train_set_ne
test_x = test_set_ne

In [159]:
train_y.unique() # codes has already turned the target variable into 0 and 1

array([0, 1], dtype=int64)

In [160]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [161]:
# for xgboost, we tune the following two parameters, both controlling the individual tree complexity
param_grid = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
# these parameters are specified 
params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8, 'objective': 'binary:logistic'}
# the binary logistic objective is defined by the loss function
# L(y, f(x)) = -log(p(y|x)) = - (ylog(p(x)) + (1-y)log(1-p(x))), where y is the binary target {0, 1}

In [162]:
# define the grid search class
# fivefold cross-validation and use parallel training
gs = GridSearchCV(xgb.XGBClassifier(**params), param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(train_x, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=0.8),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [163]:
pd.DataFrame(gs.cv_results_)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_min_child_weight,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,18.9157,0.239664,0.867118,0.895813,3,1,"{'max_depth': 3, 'min_child_weight': 1}",1,0.865904,0.895105,...,0.869717,0.895603,0.868866,0.89511,0.867684,0.896647,0.270182,0.004872,0.002249,0.000686
1,19.44254,0.244267,0.866587,0.893343,3,3,"{'max_depth': 3, 'min_child_weight': 3}",2,0.861429,0.893696,...,0.867396,0.893613,0.872016,0.892831,0.866357,0.892835,0.328272,0.012941,0.003391,0.000418
2,19.387035,0.242265,0.866587,0.891627,3,5,"{'max_depth': 3, 'min_child_weight': 5}",2,0.863915,0.89208,...,0.86723,0.890961,0.87185,0.89138,0.866191,0.891592,0.271457,0.007643,0.002948,0.000437
3,30.380039,0.39497,0.862144,0.941375,5,1,"{'max_depth': 5, 'min_child_weight': 1}",4,0.86176,0.94144,...,0.863584,0.942434,0.864556,0.940323,0.862046,0.941237,0.293196,0.02007,0.001969,0.000672
4,30.787117,0.392168,0.861614,0.930277,5,3,"{'max_depth': 5, 'min_child_weight': 3}",6,0.859606,0.929007,...,0.86176,0.930872,0.863395,0.929258,0.862875,0.930173,0.576656,0.014007,0.001429,0.001117
5,30.012888,0.391968,0.862078,0.922634,5,5,"{'max_depth': 5, 'min_child_weight': 5}",5,0.859606,0.920179,...,0.86176,0.924572,0.866545,0.921177,0.861549,0.922548,0.467101,0.010252,0.002356,0.001798
6,41.941632,0.675761,0.856508,0.984998,7,1,"{'max_depth': 7, 'min_child_weight': 1}",8,0.853638,0.984832,...,0.857616,0.984707,0.858588,0.984501,0.857569,0.98504,0.867556,0.061276,0.001834,0.000488
7,41.645029,0.60081,0.855746,0.969523,7,3,"{'max_depth': 7, 'min_child_weight': 3}",9,0.856622,0.968171,...,0.856125,0.969083,0.859582,0.968753,0.854087,0.970702,1.134703,0.075386,0.002456,0.001088
8,36.127976,0.514251,0.856939,0.956021,7,5,"{'max_depth': 7, 'min_child_weight': 5}",7,0.858114,0.956277,...,0.859937,0.956111,0.859748,0.955077,0.856409,0.955659,0.289997,0.060916,0.003468,0.000635


In [164]:
gs.best_score_

0.86711756514819971

### Try to tune other parameters by cv

In [165]:
cv_params = {'learning_rate': [0.1, 0.01, 0.05], 'subsample': [0.7,0.8,0.9]}
gs2 = GridSearchCV(gs.best_estimator_, cv_params, cv=5, scoring='accuracy', n_jobs=-1)
gs2.fit(train_x, train_y)
# report
pd.DataFrame(gs2.cv_results_).sort_values('mean_test_score')



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_subsample,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
5,19.513421,0.23376,0.860255,0.863106,0.01,0.9,"{'learning_rate': 0.01, 'subsample': 0.9}",9,0.855793,0.86481,...,0.86176,0.86195,0.864721,0.86179,0.862212,0.863039,0.347558,0.005673,0.003405,0.001155
4,19.763291,0.231058,0.860288,0.863512,0.01,0.8,"{'learning_rate': 0.01, 'subsample': 0.8}",8,0.85629,0.865556,...,0.861926,0.862821,0.863395,0.861956,0.862709,0.863371,0.368974,0.006979,0.002974,0.001201
3,19.566757,0.235861,0.86052,0.863587,0.01,0.7,"{'learning_rate': 0.01, 'subsample': 0.7}",7,0.857285,0.865224,...,0.862092,0.862116,0.863727,0.862329,0.862709,0.8642,0.390065,0.008292,0.002896,0.001186
0,19.037435,0.247769,0.866222,0.895108,0.1,0.7,"{'learning_rate': 0.1, 'subsample': 0.7}",6,0.863418,0.895769,...,0.866236,0.894152,0.869529,0.894903,0.865362,0.895363,0.47987,0.009373,0.001983,0.000551
1,19.543141,0.239364,0.867118,0.895813,0.1,0.8,"{'learning_rate': 0.1, 'subsample': 0.8}",5,0.865904,0.895105,...,0.869717,0.895603,0.868866,0.89511,0.867684,0.896647,0.308657,0.009407,0.002249,0.000686
2,19.625697,0.235661,0.867582,0.895332,0.1,0.9,"{'learning_rate': 0.1, 'subsample': 0.9}",4,0.863086,0.895603,...,0.869717,0.896017,0.87185,0.89482,0.866523,0.894451,0.352307,0.00848,0.002994,0.000595
6,19.354813,0.240964,0.868875,0.885046,0.05,0.7,"{'learning_rate': 0.05, 'subsample': 0.7}",3,0.864744,0.886154,...,0.871706,0.884662,0.874337,0.883962,0.870337,0.88521,0.3299,0.007787,0.004211,0.000723
7,19.670128,0.255174,0.869538,0.884399,0.05,0.8,"{'learning_rate': 0.05, 'subsample': 0.8}",2,0.86723,0.88491,...,0.87154,0.884123,0.873508,0.883672,0.870005,0.88463,0.315329,0.020321,0.002911,0.000445
8,16.018662,0.193932,0.869637,0.884963,0.05,0.9,"{'learning_rate': 0.05, 'subsample': 0.9}",1,0.866401,0.885739,...,0.870877,0.884206,0.874668,0.884791,0.871829,0.88463,0.674756,0.011779,0.003726,0.000557


In [166]:
gs2.best_params_

{'learning_rate': 0.05, 'subsample': 0.9}

# Early stopping CV
In GBM, how many iterations shall we take?

In [167]:
gs2.best_estimator_.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 1000,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 0.9}

In [184]:
# XGBoost supports built-in cross-validation to determine early stopping
# http://xgboost.readthedocs.io/en/latest/python/python_api.html
xgbmat = xgb.DMatrix(train_x, train_y)
current_params = params=gs2.best_estimator_.get_params()
current_params['n_jobs'] = -1
current_params['nthread'] = -1
current_params['learning_rate'] = 0.1
current_params['subsample'] = 0.7
early_stopping_gbt = xgb.cv(current_params, dtrain=xgbmat, num_boost_round=1000, 
                            nfold=5, metrics=['error'], # 'error' for binary classification errors
                            early_stopping_rounds=100)

In [185]:
early_stopping_gbt

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.192295,0.009347,0.190985,0.003605
1,0.174557,0.014765,0.172038,0.011635
2,0.166334,0.014616,0.165100,0.011414
3,0.164643,0.014460,0.164280,0.011500
4,0.159704,0.005398,0.159447,0.002519
5,0.159902,0.006612,0.159348,0.002081
6,0.158676,0.004889,0.157558,0.001623
7,0.157748,0.003106,0.157914,0.001202
8,0.157350,0.002752,0.157168,0.001762
9,0.157284,0.002940,0.156546,0.002501


In [186]:
early_stopping_gbt['test-error-mean'].idxmin()

437

In [187]:
# now we get the best set of parameters
current_params['n_estimators'] = 437

In [192]:
final_gbt = xgb.train(current_params, xgbmat, 437)

# Performance on Test Data

In [194]:
test_dmat = xgb.DMatrix(test_x)
from sklearn.metrics import accuracy_score
pred_y = final_gbt.predict(test_dmat)

In [195]:
pred_y # this is p(y=1|x)

array([ 0.00295815,  0.20412157,  0.27673399, ...,  0.81844318,
        0.12009671,  0.7825765 ], dtype=float32)

In [196]:
pred_y[pred_y >= 0.5] = 1
pred_y[pred_y < 0.5] = 0
accuracy_score(pred_y, test_y)

0.86872509960159361