## Demo_XGBoost

References:
- https://gist.github.com/ylogx/53fef94cc61d6a3e9b3eb900482f41e0
- https://github.com/dmlc/xgboost/issues/56
- https://github.com/dmlc/xgboost/issues/3055

In [1]:
%%time
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import ShuffleSplit
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as mse

# Demo: compare a model trained on the full training set against several ways
# of combining two halves of that set (independent models, continued boosting,
# and refreshing an existing model's trees with new data).

# Load the Boston housing data as a DataFrame / Series pair sharing one index.
boston = load_boston()
features = boston.feature_names
X = pd.DataFrame(boston.data, columns=features)
y = pd.Series(boston.target, index=X.index)


# Split data into training and testing sets.
# n_splits=1, so the split generator yields exactly one (train, test) pair.
rs = ShuffleSplit(test_size=0.3, n_splits=1, random_state=0)
train_idx, test_idx = next(rs.split(X))

# Cut the training indices into two halves to simulate data arriving in batches.
train_split = round(len(train_idx) / 2)
train1_idx = train_idx[:train_split]
train2_idx = train_idx[train_split:]

# ShuffleSplit yields positional indices, so select rows by position (.iloc)
# rather than by label.
X_train = X.iloc[train_idx]
X_train_1 = X.iloc[train1_idx]
X_train_2 = X.iloc[train2_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_train_1 = y.iloc[train1_idx]
y_train_2 = y.iloc[train2_idx]
y_test = y.iloc[test_idx]

xg_train_0 = xgb.DMatrix(X_train, label=y_train)
xg_train_1 = xgb.DMatrix(X_train_1, label=y_train_1)
xg_train_2 = xgb.DMatrix(X_train_2, label=y_train_2)
xg_test = xgb.DMatrix(X_test, label=y_test)

num_round = 15
verbose_eval = 5  # log the evaluation metric every 5 boosting rounds
watch_list = [(xg_test, 'xg_test')]

# NOTE(review): newer XGBoost releases document this objective as
# 'reg:squarederror'; 'reg:linear' is kept for the version this was run on.
# NOTE(review): 'verbose' is not among the documented booster parameters
# ('verbosity' is) - confirm whether it has any effect.
params = {'objective': 'reg:linear', 'verbose': False}

# Benchmark: one model trained on the whole training set.
print('full train\t')
model_0 = xgb.train(params, xg_train_0, num_round, watch_list, verbose_eval=verbose_eval)

# Model trained on the first half only; also saved to disk.
print('model 1 \t')
model_1 = xgb.train(params, xg_train_1, num_round, watch_list, verbose_eval=verbose_eval)
model_1.save_model('model_1.model')

# Model trained from scratch on the second half only.
print('model 2 \t')
model_2_v1 = xgb.train(params, xg_train_2, num_round, watch_list, verbose_eval=verbose_eval)

# Continued training: xgb_model=model_1 starts from model_1 and boosts
# num_round additional rounds on the second half.
print('model 1+2\t, this logs show continue train, but got a test score same to model 1?')
model_2_v2 = xgb.train(params, xg_train_2, num_round, watch_list, verbose_eval=verbose_eval, xgb_model=model_1)

# Update mode: per the XGBoost parameter docs, process_type='update' takes the
# trees of the initial model one per iteration and runs the listed updaters on
# them instead of growing new trees; 'refresh' with refresh_leaf recomputes
# node statistics and leaf values from the data passed to train().
params.update({
    'process_type': 'update',
    'updater': 'refresh',
    'refresh_leaf': True,
})
print('model 1+update2\t, this logs do not show continue train, but got a best test score?')
model_2_v2_update = xgb.train(params, xg_train_2, num_round, watch_list, verbose_eval=verbose_eval, xgb_model=model_1)

# Test-set MSE for every variant; arguments follow sklearn's (y_true, y_pred) order.
print('full train\t', mse(y_test, model_0.predict(xg_test)))  # benchmark
print('model 1 \t', mse(y_test, model_1.predict(xg_test)))
print('model 2 \t', mse(y_test, model_2_v1.predict(xg_test)))  # "before"
print('model 1+2\t', mse(y_test, model_2_v2.predict(xg_test)))  # "after"
print('model 1+update2\t', mse(y_test, model_2_v2_update.predict(xg_test)))  # "after"

full train	
[0]	xg_test-rmse:16.9311
[5]	xg_test-rmse:5.36819
[10]	xg_test-rmse:4.41758
[14]	xg_test-rmse:4.28357
model 1 	
[0]	xg_test-rmse:17.078
[5]	xg_test-rmse:6.0592
[10]	xg_test-rmse:5.04216
[14]	xg_test-rmse:4.94968
model 2 	
[0]	xg_test-rmse:16.9631
[5]	xg_test-rmse:6.08084
[10]	xg_test-rmse:5.18633
[14]	xg_test-rmse:5.12085
model 1+2	, this logs show continue train, but got a test score same to model 1?
[0]	xg_test-rmse:4.72028
[5]	xg_test-rmse:4.79626
[10]	xg_test-rmse:4.96232
[14]	xg_test-rmse:4.96374
model 1+update2	, this logs do not show continue train, but got a best test score?
[0]	xg_test-rmse:17.0353
[5]	xg_test-rmse:5.0438
[10]	xg_test-rmse:3.9661
[14]	xg_test-rmse:3.91295
full train	 18.348929164523298
model 1 	 24.499370663166886
model 2 	 26.223108502105553
model 1+2	 24.63867891225536
model 1+update2	 15.311159054248336
CPU times: user 16.8 s, sys: 5.22 s, total: 22 s
Wall time: 2.67 s
