In [None]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [None]:
# `load_boston` was removed from scikit-learn in version 1.2, where importing it
# raises ImportError. Try the built-in loader first so older installations behave
# exactly as before, and otherwise rebuild the same arrays from the raw file.
try:
    from sklearn.datasets import load_boston

    dataset = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # Fallback recipe from the scikit-learn removal notice: after 22 header
    # lines, every record spans two physical lines (11 values, then 3 values).
    # The first 13 values are the features and the 14th is the target.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    dataset = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(
            ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
             'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
        ),
    )

# Quick look at what was loaded: available keys, feature matrix shape, column names
print(dataset.keys())
print(dataset.data.shape)
print(dataset.feature_names)

In [None]:
# Build a labelled feature table, then append the target as a 'PRICE' column
data = pd.DataFrame(dataset.data, columns=dataset.feature_names).assign(
    PRICE=dataset.target
)

print(data.head())

In [None]:
#data.info()
#data.describe()

In [None]:
# Every column except the last is a feature; the last column is the target
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

## wrap features and labels in XGBoost's DMatrix data structure
dmatrix = xgb.DMatrix(X, label=y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameters used below:
#   objective        : loss function - reg:squarederror for regression with squared loss;
#                      binary:logistic for binary classification with probability output;
#                      binary:hinge for binary classification with 0/1 decisions
#   learning_rate    : step-size shrinkage, in the range [0, 1]
#   colsample_bytree : fraction of features sampled for each tree
#   max_depth        : how deep each tree is allowed to grow
#   reg_alpha        : L1 regularization (called `alpha` in the native API)
#   n_estimators     : number of trees to be built
#
# The L1 term is given as the wrapper's own `reg_alpha` keyword. Passing `alpha`
# instead routes it through **kwargs, which the XGBoost scikit-learn wrapper
# documents as unsupported by scikit-learn tooling.

xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=5,
    colsample_bytree=0.3,
    learning_rate=0.1,
    reg_alpha=10,
    n_estimators=10,
)

In [None]:
# Train on the training split, then predict the held-out test rows
xg_reg.fit(X=X_train, y=y_train)
pred = xg_reg.predict(X_test)

In [None]:
# Root of the mean squared error between the held-out targets and the predictions
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))
print("RMSE : %f" % rmse)

In [None]:
### k-fold cross validation using XGBoost ###
# params maps each hyper-parameter name to its value

params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}

# 3-fold CV on the full DMatrix: at most 50 boosting rounds, with early stopping
# set to 10 rounds; results come back as a pandas DataFrame
cv_results = xgb.cv(
    params=params,
    dtrain=dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)

In [None]:
# Preview the first five rows of the cross-validation results
cv_results.head(n=5)

In [None]:
# Mean test RMSE from the last row of the cross-validation results
print(cv_results["test-rmse-mean"].tail(1))

In [None]:
### Visualize Boosting trees and feature importance ###
# NOTE: this rebinds xg_reg, which earlier held the XGBRegressor, to the
# object returned by xgb.train; the plotting cells below use this new value.

xg_reg = xgb.train(params, dmatrix, num_boost_round=10)

In [None]:
# Plotting the first tree

import matplotlib.pyplot as plt

# The figure size has to be fixed before the tree is drawn. Assigning
# plt.rcParams['figure.figsize'] after plot_tree() does not resize this figure;
# it only changes the default for figures created later. Creating the axes
# explicitly also avoids mutating global plotting state.
fig, ax = plt.subplots(figsize=(50, 10))
xgb.plot_tree(xg_reg, num_trees=0, ax=ax)
plt.show()

In [None]:
# Examine the importance of each feature column

# Size the figure up front and hand its axes to plot_importance(). Setting
# plt.rcParams['figure.figsize'] after the plot call is too late to affect
# this figure, which would otherwise inherit whatever size was last configured.
fig, ax = plt.subplots(figsize=(5, 5))
xgb.plot_importance(xg_reg, ax=ax)
plt.show()