Learn XGBoost with [A Guide to Gradient Boosted Trees with XGBoost in Python](https://jessesw.com/XG-Boost/)

In [2]:
import numpy as np
import pandas as pd

In [3]:
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
                      skiprows = 1, header = None) # Make sure to skip a row for the test set

In [4]:
train_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
test_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [6]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 
              'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
             'wage_class']

In [7]:
test_set.columns = col_labels
train_set.columns = col_labels

In [8]:
train_set.info() # note that Pandas will not take '?' as nan automatically, use na_values to specify it
# By default the following values are interpreted as NaN: ‘’, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, 
# ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘n/a’, ‘nan’, ‘null’.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education_num     32561 non-null int64
marital_status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital_gain      32561 non-null int64
capital_loss      32561 non-null int64
hours_per_week    32561 non-null int64
native_country    32561 non-null object
wage_class        32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Remove rows containing unknown values (" ?")

In [9]:
train_set = train_set.replace(" ?", np.nan).dropna()
test_set = test_set.replace(" ?", np.nan).dropna()
print(train_set.shape, test_set.shape)

(30162, 15) (15060, 15)


In [10]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.


In [11]:
# note that the wage_class in the test set has an additional dot after its value
test_set['wage_class'] = test_set['wage_class'].replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
test_set['wage_class'].unique()

array([' <=50K', ' >50K'], dtype=object)

## Applying Ordinal Encoding to Categoricals
All called numeric encoding. That is, assign a unique number to each category. 

In [12]:
combine_set = pd.concat([train_set, test_set])

In [13]:
for feature in combine_set.columns:
    if combine_set[feature].dtype == 'object': # 'category'
        combine_set[feature] = pd.Categorical(combine_set[feature]).codes # code each category from 0

In [27]:
train_set = combine_set[0:train_set.shape[0]]
test_set = combine_set[train_set.shape[0]:]
train_y = train_set.pop('wage_class')
test_y = test_set.pop('wage_class')
train_X = train_set;
test_X = test_set;

In [28]:
# check the class balance
train_y.value_counts()

0    22654
1     7508
Name: wage_class, dtype: int64

In [29]:
test_y.value_counts() # as we can see, the training and test set are imbalanced

0    11360
1     3700
Name: wage_class, dtype: int64

# XGBoost: parameter tuning
Reference: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

## 1. learning rate and number of estimators
These two are usually coupled, i.e., a small learning rate (shrinkage) requires more trees.
We do a grid search coarsely to identify a proper pair of parameters.

In [31]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [32]:
# defaut values for the parameters
xgb.XGBClassifier().get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 1}

In [39]:
param_grid = {'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.3], 'n_estimators': [100, 500, 1000, 2000]}
gs = GridSearchCV(xgb.XGBClassifier(), param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(train_X, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.3], 'n_estimators': [100, 500, 1000, 2000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [40]:
pd.DataFrame(gs.cv_results_).sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_learning_rate,param_n_estimators,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
10,24.967986,0.283092,0.86914,0.882808,0.05,1000,"{'learning_rate': 0.05, 'n_estimators': 1000}",1,0.863252,0.883294,...,0.870545,0.883128,0.875332,0.881434,0.871663,0.881853,1.6533,0.018144,0.004455,0.001045
11,46.909326,0.476222,0.868842,0.892895,0.05,2000,"{'learning_rate': 0.05, 'n_estimators': 2000}",2,0.864412,0.893448,...,0.87038,0.892785,0.871353,0.892996,0.869176,0.892213,1.063263,0.029266,0.002383,0.000403
13,11.57743,0.112976,0.868808,0.883313,0.1,500,"{'learning_rate': 0.1, 'n_estimators': 500}",3,0.863086,0.883626,...,0.869882,0.883584,0.875332,0.882719,0.870171,0.883055,0.2324,0.006943,0.004214,0.000365
14,22.808727,0.252971,0.867814,0.89355,0.1,1000,"{'learning_rate': 0.1, 'n_estimators': 1000}",4,0.865241,0.893945,...,0.869385,0.893945,0.87069,0.893618,0.868513,0.892959,0.84764,0.018229,0.002212,0.000384
16,2.433247,0.023116,0.867714,0.878299,0.3,100,"{'learning_rate': 0.3, 'n_estimators': 100}",5,0.861429,0.879315,...,0.869551,0.877492,0.874005,0.876212,0.869508,0.879077,0.037339,0.003218,0.004447,0.001253
9,13.003094,0.130388,0.867449,0.874909,0.05,500,"{'learning_rate': 0.05, 'n_estimators': 500}",6,0.861926,0.875254,...,0.868722,0.874715,0.873011,0.873311,0.869673,0.875098,0.944458,0.013712,0.004012,0.00093
7,50.13901,0.556176,0.865924,0.872629,0.01,2000,"{'learning_rate': 0.01, 'n_estimators': 2000}",7,0.860434,0.873513,...,0.868391,0.872601,0.87185,0.870908,0.867186,0.872032,3.39928,0.048046,0.004249,0.001118
17,11.278328,0.109574,0.865195,0.902369,0.3,500,"{'learning_rate': 0.3, 'n_estimators': 500}",8,0.861263,0.903021,...,0.868888,0.902482,0.868037,0.902487,0.864367,0.900543,0.374753,0.007104,0.002864,0.000967
15,45.138633,0.465014,0.864764,0.907342,0.1,2000,"{'learning_rate': 0.1, 'n_estimators': 2000}",9,0.860766,0.907995,...,0.867064,0.907373,0.86555,0.906216,0.865528,0.907339,1.377181,0.02523,0.002121,0.000615
18,22.784109,0.232957,0.860652,0.920761,0.3,1000,"{'learning_rate': 0.3, 'n_estimators': 1000}",10,0.858611,0.920842,...,0.864578,0.919889,0.859748,0.920431,0.860222,0.919605,0.509489,0.012887,0.002044,0.001216


The best mean_test_score picks learning_rate=0.05 and n_estimators=1000. Since neither lies on the boundary of our grid search, they are chosen as the best parameters
## 2. Control individual booster (decision tree) complexity
max_depth, min_child_weight and gamma
Because generally boosting decreases bias, we should choose a *weak* base learner, for instance, a shallow decision tree. 

In [42]:
param_grid = {'max_depth': [1, 2, 3, 4, 5], 'min_child_weight': [0.1, 1, 1.5, 5]}
gs2 = GridSearchCV(gs.best_estimator_, param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs2.fit(train_X, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 2, 3, 4, 5], 'min_child_weight': [0.1, 1, 1.5, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [44]:
pd.DataFrame(gs2.cv_results_).sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_max_depth,param_min_child_weight,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
8,22.29804,0.240561,0.869903,0.883173,3,0.1,"{'max_depth': 3, 'min_child_weight': 0.1}",1,0.86491,0.884537,...,0.871706,0.882631,0.876492,0.88218,0.871,0.882102,0.558747,0.003429,0.004313,0.001079
13,29.884425,0.316512,0.869836,0.893997,4,1.0,"{'max_depth': 4, 'min_child_weight': 1}",2,0.86491,0.894857,...,0.871043,0.893986,0.87384,0.893162,0.870337,0.893954,0.564917,0.01385,0.002919,0.000537
14,29.581726,0.297499,0.869505,0.893674,4,1.5,"{'max_depth': 4, 'min_child_weight': 1.5}",3,0.863584,0.894898,...,0.871706,0.892702,0.873674,0.893121,0.870337,0.893581,0.850701,0.013022,0.003453,0.000764
12,30.923218,0.318514,0.869405,0.894337,4,0.1,"{'max_depth': 4, 'min_child_weight': 0.1}",4,0.865407,0.894525,...,0.871208,0.893448,0.871684,0.894861,0.870337,0.893995,0.593056,0.017686,0.002295,0.000546
9,23.050348,0.247366,0.86914,0.882808,3,1.0,"{'max_depth': 3, 'min_child_weight': 1}",5,0.863252,0.883294,...,0.870545,0.883128,0.875332,0.881434,0.871663,0.881853,0.658974,0.021482,0.004455,0.001045
15,31.165689,0.325518,0.869074,0.889165,4,5.0,"{'max_depth': 4, 'min_child_weight': 5}",6,0.862921,0.890091,...,0.871374,0.888599,0.874337,0.888976,0.870005,0.888401,1.168402,0.017469,0.003929,0.000656
19,36.316131,0.38826,0.868941,0.896683,5,5.0,"{'max_depth': 5, 'min_child_weight': 5}",7,0.864578,0.897012,...,0.869054,0.896349,0.87384,0.895814,0.870005,0.89644,1.814589,0.029364,0.003069,0.000675
10,23.154615,0.244064,0.868908,0.882609,3,1.5,"{'max_depth': 3, 'min_child_weight': 1.5}",8,0.863086,0.88317,...,0.871374,0.882672,0.874337,0.881475,0.870337,0.880942,0.90711,0.008874,0.004093,0.001351
17,37.305093,0.412677,0.868808,0.907599,5,1.0,"{'max_depth': 5, 'min_child_weight': 1}",9,0.864247,0.907497,...,0.869551,0.907497,0.873011,0.907004,0.871663,0.907339,1.412363,0.044477,0.003395,0.000559
16,36.803865,0.377653,0.868411,0.909074,5,0.1,"{'max_depth': 5, 'min_child_weight': 0.1}",10,0.865241,0.909279,...,0.869385,0.909196,0.871353,0.910153,0.870005,0.908209,1.066291,0.023622,0.002353,0.000673


In [45]:
gs2.best_params_

{'max_depth': 3, 'min_child_weight': 0.1}

### Tune gamma, penalty of the leaf number
By default, gamma = 0

In [47]:
param_grid3 = {'gamma': [0, 0.1, 0.3, 0.5, 0.7, 1, 2]}
gs3 = GridSearchCV(gs2.best_estimator_, param_grid3, scoring='accuracy', cv=5, n_jobs=-1)
gs3.fit(train_X, train_y)
pd.DataFrame(gs3.cv_results_).sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_gamma,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,22.688645,0.241464,0.869903,0.883173,0.0,{'gamma': 0},1,0.86491,0.884537,0.865407,...,0.871706,0.882631,0.876492,0.88218,0.871,0.882102,0.450435,0.011977,0.004313,0.001079
2,23.113732,0.237661,0.869637,0.883222,0.3,{'gamma': 0.3},2,0.865241,0.884496,0.866236,...,0.870711,0.882589,0.875497,0.882511,0.870502,0.882019,0.446195,0.008258,0.003664,0.001058
1,23.232112,0.246067,0.869505,0.883247,0.1,{'gamma': 0.1},3,0.864744,0.884413,0.865407,...,0.87154,0.883004,0.875,0.882801,0.870834,0.881936,0.511652,0.016109,0.003888,0.000898
4,22.749086,0.212744,0.869306,0.881531,0.7,{'gamma': 0.7},4,0.864578,0.882341,0.865573,...,0.871208,0.882258,0.875995,0.880729,0.869176,0.878331,0.46862,0.020472,0.004115,0.001906
5,22.467395,0.188027,0.869007,0.879542,1.0,{'gamma': 1},5,0.864081,0.878818,0.86491,...,0.871374,0.879191,0.874668,0.878284,0.870005,0.878745,0.396564,0.010959,0.003993,0.001591
3,22.763095,0.23636,0.868742,0.883048,0.5,{'gamma': 0.5},6,0.864081,0.883916,0.86491,...,0.869717,0.882548,0.874337,0.882304,0.870668,0.882226,0.536714,0.008435,0.003805,0.000857
6,18.975042,0.11688,0.867184,0.875124,2.0,{'gamma': 2},7,0.86176,0.874839,0.862589,...,0.869219,0.875212,0.872679,0.874099,0.869673,0.874435,2.944512,0.011456,0.004268,0.001027


## 3. Regulate the data source for each tree (robustness to noise, like bagging)
subsample, colsample_bytree

In [50]:
param_grid4 = {'subsample': np.linspace(0.6, 1, 5), 'colsample_bytree': np.linspace(0.6, 1, 5)}
gs4 = GridSearchCV(gs3.best_estimator_, param_grid4, scoring='accuracy', cv=5, n_jobs=-1)
gs4.fit(train_X, train_y)
pd.DataFrame(gs4.cv_results_).sort_values('mean_test_score', ascending=False)



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_colsample_bytree,param_subsample,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
2,16.799864,0.262578,0.870035,0.884093,0.6,0.8,"{'colsample_bytree': 0.6, 'subsample': 0.8}",1,0.866899,0.884952,...,0.871043,0.883543,0.87616,0.883589,0.870834,0.883884,0.701843,0.02278,0.003795,0.000548
24,22.066933,0.208942,0.869903,0.883173,1.0,1.0,"{'colsample_bytree': 1.0, 'subsample': 1.0}",2,0.86491,0.884537,...,0.871706,0.882631,0.876492,0.88218,0.871,0.882102,2.676483,0.015309,0.004313,0.001079
8,17.648136,0.248268,0.86977,0.884847,0.7,0.9,"{'colsample_bytree': 0.7, 'subsample': 0.9}",3,0.866899,0.885283,...,0.872203,0.884454,0.874171,0.884542,0.86901,0.88463,0.439406,0.019093,0.002979,0.000378
12,21.328025,0.26678,0.869737,0.885104,0.8,0.8,"{'colsample_bytree': 0.8, 'subsample': 0.8}",4,0.86723,0.886112,...,0.872203,0.884952,0.873508,0.884211,0.871166,0.885251,0.812157,0.00634,0.003324,0.000612
13,20.547597,0.25147,0.869604,0.88527,0.8,0.9,"{'colsample_bytree': 0.8, 'subsample': 0.9}",5,0.86607,0.885532,...,0.87038,0.884869,0.875166,0.885081,0.870171,0.884878,0.932614,0.014756,0.003337,0.000432
22,25.049642,0.264079,0.869604,0.885899,1.0,0.8,"{'colsample_bytree': 1.0, 'subsample': 0.8}",5,0.866899,0.886236,...,0.870877,0.885159,0.873176,0.885495,0.869342,0.886287,0.575087,0.012379,0.002249,0.00048
17,22.467595,0.264779,0.869505,0.884938,0.9,0.8,"{'colsample_bytree': 0.9, 'subsample': 0.8}",7,0.86723,0.886319,...,0.870711,0.884827,0.873674,0.883796,0.869342,0.884837,0.647065,0.011185,0.002556,0.000804
23,24.109306,0.233458,0.869472,0.885419,1.0,0.9,"{'colsample_bytree': 1.0, 'subsample': 0.9}",8,0.867064,0.885781,...,0.870048,0.885449,0.873342,0.884915,0.872326,0.885251,0.614378,0.012246,0.003262,0.000314
18,21.404776,0.247267,0.869472,0.885079,0.9,0.9,"{'colsample_bytree': 0.9, 'subsample': 0.9}",8,0.866733,0.885159,...,0.870545,0.884786,0.875497,0.884418,0.869673,0.884795,0.444734,0.01599,0.003628,0.000624
19,20.061768,0.250269,0.869405,0.883123,0.9,1.0,"{'colsample_bytree': 0.9, 'subsample': 1.0}",10,0.865075,0.884662,...,0.870214,0.882962,0.875497,0.882138,0.869342,0.882268,0.446161,0.014698,0.003544,0.000928


As we may notice, compared with the original defautl 1, the two parameters [0.6, 0.8] improve the accuracy by 0.1%. Besidew, if we pay attention to std_test_score, it is also reduced.

## 4. [Optional] Regularize the leaf weights using *reg_lambda* for L2 and *reg_alpha* for L1

## 5. Fine tune the learning rate 
We might choose a large learning rate like 0.1 initially to accelerate the above tuning process, since a high learning rate usually requires a small number of trees.

## 6. Fine tune the nubmer of boosting iterations (trees): early stopping
Generally, we can fix the learning rate and then tune the boosting rounds with XGBoost's built-in cv

In [51]:
xgbmat = xgb.DMatrix(train_X, train_y)
xgb_params =gs4.best_estimator_.get_params()
xgb_params

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.59999999999999998,
 'gamma': 0,
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 0.1,
 'missing': None,
 'n_estimators': 1000,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 0.80000000000000004}

In [54]:
xgb_params['nthread'] = -1 # avoid error
xgb_params['seed'] = 0
early_stopping_gbt = xgb.cv(xgb_params, dtrain=xgbmat, num_boost_round=2000, 
                            nfold=5, metrics=['error'], # 'error' for binary classification errors
                            early_stopping_rounds=50)

In [55]:
early_stopping_gbt # a DataFrame

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.195578,0.005825,0.196704,0.006953
1,0.171638,0.016166,0.171474,0.012778
2,0.173197,0.014537,0.172444,0.011176
3,0.170809,0.017857,0.170049,0.014146
4,0.163185,0.006067,0.162265,0.006668
5,0.159240,0.003312,0.157980,0.003241
6,0.159837,0.003604,0.158370,0.003681
7,0.159837,0.002249,0.158304,0.005109
8,0.160168,0.003221,0.158801,0.002364
9,0.160765,0.004690,0.159903,0.001628


In [57]:
# From 518 - 50 to 518, there are almost no changes in test_error_mean
num_boost_round = early_stopping_gbt.shape[0] - 50 // 2
# train the final model
# For xgboost.train, the parameter 'n_estimators' is of no use. We should use the 3rd parameter num_boost_round to control boosting.
final_gbt = xgb.train(xgb_params, xgbmat, num_boost_round)

# Test the final model's performance on test set

In [59]:
test_dmat = xgb.DMatrix(test_X)
from sklearn.metrics import accuracy_score
pred_y = final_gbt.predict(test_dmat)
pred_y[pred_y > 0.5] = 1
pred_y[pred_y <= 0.5] = 0
accuracy_score(pred_y, test_y)

0.86872509960159361

In [65]:
final_gbt.save_model('final_gbt.model')
final_gbt.dump_model('final_gbt.txt')
