In [60]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import xgboost as xgb

In [61]:
# Fairly standard XGB example using the Boston Housing dataset.
# `boston` is read below for its `.data`, `.target` and `.feature_names` attributes.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed scikit-learn version before re-running this notebook.
boston = load_boston()

In [62]:
# The Boston object exposes its data and targets as arrays; wrap them in labelled DataFrames
# so they are ready for train_test_split. Feature columns take their names from
# boston.feature_names, and the target becomes a single 'PRICE' column.
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.DataFrame(boston.target, columns=['PRICE'])

In [70]:
# Build the DMatrix that the cross validation step consumes.
# It wraps the complete X / y frames (not a train/test subset).
dmat = xgb.DMatrix(X, label=y)

In [71]:
# Hold out 20% of the rows as the test set; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

In [72]:
# Lots of hyperparameters: configure the regressor first, then fit it on the training split.
# fit() hands back the fitted estimator, which is what `model` refers to afterwards.
regressor = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
model = regressor.fit(X_train, y_train)

In [73]:
# Initial set of predictions: run the fitted model on the held-out test features.
# These are compared against y_test when the RMSE is computed.
predictions = model.predict(X_test)

In [74]:
# Root mean squared error of the predictions against y_test, rounded to two decimals.
mse = mean_squared_error(y_test, predictions)
RMSE = np.round(np.sqrt(mse), 2)

In [75]:
# Report the hold-out RMSE so it can be compared with the cross-validated value later on.
test_rmse_message = "Test RMSE: {}".format(RMSE)
print(test_rmse_message)

Test RMSE: 9.6


In [78]:
# Cross validation parameters and model fitting.
# Booster settings mirror the regressor fitted earlier; the run uses 3 folds over `dmat`,
# up to 50 boosting rounds with early stopping after 10 rounds, and reports RMSE as a DataFrame.
params = {
    "objective": "reg:squarederror",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5,
    "alpha": 10,
}
cv_results = xgb.cv(
    params=params,
    dtrain=dmat,
    num_boost_round=50,
    nfold=3,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=6,
)

In [103]:
# Generating CV results. Train and Test RMSEs ran pretty close to one another, so no drastic
# overfitting occurring.
# The last row gives us the final mean test RMSE, which is what we want to evaluate against
# the earlier hold-out RMSE. `.iloc[-1]` selects that row by position, so the lookup does not
# depend on the frame's index labels the way `[len(cv_results)-1]` did.
last_test_RMSE = np.round(cv_results['test-rmse-mean'].iloc[-1], 2)

In [104]:
# Well lower than the original hold-out RMSE; the improvement itself still needs to be calculated.
cv_rmse_message = "CV Test RMSE: {}".format(last_test_RMSE)
print(cv_rmse_message)

CV Test RMSE: 3.98


In [108]:
# Improvement calculation to round things off: hold-out RMSE minus cross-validated RMSE,
# rounded to two decimals.
# NOTE(review): the two figures come from different setups (n_estimators=10 on a single
# hold-out split vs. up to 50 boosting rounds under 3-fold CV) — confirm this is the
# comparison intended.
difference = np.round(RMSE - last_test_RMSE, 2)

In [110]:
# Not bad! Show the RMSE gap computed above.
improvement_message = "RMSE Improvement: {}".format(difference)
print(improvement_message)

RMSE Improvement: 5.62
