# Extreme Gradient Boosting

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import graphviz
import xgboost as xgb
import os
# Directory holding the CSV files used by this notebook. Defaults to the
# original author's local path; set the XGB_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('XGB_DATA_DIR') or '/Users/Vincent/Desktop/Python/DataCamp/Supervised_sklearn/Data'
# The data file is later opened by bare filename, so the working directory
# is switched to the data directory here.
os.chdir(DATA_DIR)

## XGBoost's internal data structure (`DMatrix`): memory-efficient and fast to train on

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Load the Ames housing data: the last column is the target, every other
# column is a feature.
housing = pd.read_csv('ames_housing_trimmed_processed.csv')
X, y = housing.iloc[:,:-1], housing.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# --- scikit-learn style API: tree booster, 10 estimators, fitted on pandas objects ---
xg_reg = xgb.XGBRegressor(n_estimators=10, objective='reg:squarederror', booster='gbtree', seed=123)
xg_reg.fit(X_train, y_train)

# Evaluate on the held-out split. mean_squared_error expects (y_true, y_pred).
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE using pandas DataFrame: %f" % (rmse))

# --- native API: the same split wrapped in XGBoost's DMatrix container ---
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

# NOTE: this run uses the *linear* booster with 5 rounds, whereas the model
# above is a tree booster with 10 estimators. The two printed RMSE values
# therefore compare different models, not only different data containers.
params = {"booster":"gblinear", "objective":"reg:squarederror"}
# `xg_reg` is rebound here: from this point on it is the object returned by
# xgb.train, not the XGBRegressor fitted above.
xg_reg = xgb.train(params=params, dtrain=DM_train, num_boost_round=5)

# Predict the labels of the test set and score them.
preds = xg_reg.predict(DM_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE using DMatrix data structure : %f" % (rmse))

RMSE using pandas DataFrame: 78847.401758
RMSE using DMatrix data structure : 44331.645061


## Cross-validation

In [3]:
# Wrap the full feature matrix and target in a DMatrix for use with xgb.cv.
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Booster settings: squared-error regression with max_depth 4.
params = {
    "objective": "reg:squarederror",
    "max_depth": 4,
}

# 4-fold cross-validation over 5 boosting rounds; as_pandas=True returns the
# per-round RMSE statistics as a DataFrame.
cv_results = xgb.cv(
    params=params,
    dtrain=housing_dmatrix,
    num_boost_round=5,
    nfold=4,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

print(cv_results,'\n')
# The last row holds the metrics after the final boosting round.
print("Last boosting round has mean test RMSE: {}".format(cv_results["test-rmse-mean"].iloc[-1]))

   train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0    141767.488281      429.449371   142980.464844    1193.806011
1    102832.562500      322.503447   104891.398438    1223.161012
2     75872.621094      266.493573    79478.947265    1601.341377
3     57245.657226      273.633063    62411.919922    2220.151162
4     44401.291992      316.426590    51348.276367    2963.378029 

Last boosting round has mean test RMSE: 51348.27636725


In [4]:
# L2 regularisation strengths (the "lambda" booster parameter) to compare.
reg_params = [1, 10, 100]
# Base booster settings shared by every run in this sweep.
params = {"objective":"reg:squarederror","max_depth":3}
rmses_l2 = []

for reg in reg_params:
    # NOTE: `params` is updated in place, so once the loop finishes it still
    # carries the last lambda value tried.
    params["lambda"] = reg

    # 2-fold cross-validation over 5 boosting rounds for this lambda.
    cv_results_rmse = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=5,
        nfold=2,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Keep the mean test RMSE reported for the final boosting round.
    rmses_l2.append(cv_results_rmse["test-rmse-mean"].iloc[-1])

# One row per lambda value with its final-round RMSE.
print("Best rmse as a function of l2:")
print(pd.DataFrame({"l2": reg_params, "rmse": rmses_l2}))

Best rmse as a function of l2:
    l2          rmse
0    1  52275.355469
1   10  57746.060547
2  100  76624.617188


## Model tuning

In [5]:
# Compare the final cross-validated RMSE for several boosting-round budgets.
# NOTE: `params` is not defined in this cell; it is whatever the preceding
# L2 sweep left behind, including the last "lambda" value that loop assigned.
num_rounds = [5, 10, 15]
final_rmse_per_round = []

for curr_num_rounds in num_rounds:
    # 3-fold cross-validation with the current number of boosting rounds.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=curr_num_rounds,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Mean test RMSE after the last round of this run.
    final_rmse_per_round.append(cv_results["test-rmse-mean"].iloc[-1])

num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses,columns=["num_boosting_rounds","rmse"]),'\n')

# Same setup with up to 50 rounds and early_stopping_rounds=10.
cv_results = xgb.cv(
    params=params,
    dtrain=housing_dmatrix,
    num_boost_round=50,
    early_stopping_rounds=10,
    nfold=3,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)
print(cv_results.tail(10))

   num_boosting_rounds          rmse
0                    5  73326.197917
1                   10  47675.869792
2                   15  40710.986979 

    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
40     26401.001302      709.696138    32960.731120    1889.183710
41     26197.427735      742.047407    32856.434896    1837.397750
42     26006.160807      711.238348    32739.975260    1842.710087
43     25834.891927      706.683724    32659.485677    1847.711983
44     25660.932943      716.522789    32566.152995    1873.025363
45     25518.350911      717.870407    32510.390625    1900.405997
46     25352.604167      694.735496    32436.325521    1918.376835
47     25198.511719      683.670090    32348.590495    1900.253949
48     25062.175130      686.529028    32252.923177    1902.939875
49     24914.595052      688.357009    32168.564453    1892.425228


In [6]:
# Learning rates ("eta") to compare.
# NOTE: `params` is inherited from the earlier cells and is modified in place
# below, so it keeps their settings and ends this cell with the last eta tried.
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []

for curr_val in eta_vals:
    params["eta"] = curr_val

    # 3-fold cross-validation: at most 10 rounds, early_stopping_rounds=5.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        early_stopping_rounds=5,
        nfold=3,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Record the mean test RMSE from the last row of the results.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

print(pd.DataFrame({"eta": eta_vals, "best_rmse": best_rmse}))

     eta      best_rmse
0  0.001  196079.104167
1  0.010  183249.000000
2  0.100   99517.726562


In [7]:
# max_depth values to compare.
# NOTE: `params` is the dict mutated by the earlier tuning cells, so besides
# the max_depth assigned here it still carries the last "lambda" and "eta"
# values those cells set.
max_depths = [2,5,10,20]
best_rmse = []

for curr_val in max_depths:
    params["max_depth"] = curr_val

    # 2-fold cross-validation: at most 10 rounds, early_stopping_rounds=5.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        early_stopping_rounds=5,
        nfold=2,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Record the mean test RMSE from the last row of the results.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

print(pd.DataFrame({"max_depth": max_depths, "best_rmse": best_rmse}))

   max_depth      best_rmse
0          2  102665.269532
1          5  102752.746093
2         10  102751.972657
3         20  102751.972657


In [8]:
# colsample_bytree values to compare.
# NOTE: `params` is the dict mutated by the earlier tuning cells, so it still
# carries the last "lambda", "eta" and "max_depth" values assigned there.
colsample_bytree_vals = [0.1,0.5,0.8,1]
best_rmse = []

for curr_val in colsample_bytree_vals:
    params["colsample_bytree"] = curr_val

    # 2-fold cross-validation: at most 10 rounds, early_stopping_rounds=5.
    cv_results = xgb.cv(
        params=params,
        dtrain=housing_dmatrix,
        num_boost_round=10,
        early_stopping_rounds=5,
        nfold=2,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Record the mean test RMSE from the last row of the results.
    best_rmse.append(cv_results["test-rmse-mean"].iloc[-1])

print(pd.DataFrame({"colsample_bytree": colsample_bytree_vals, "best_rmse": best_rmse}))

   colsample_bytree      best_rmse
0               0.1  105409.710938
1               0.5  103051.402343
2               0.8  102617.707031
3               1.0  102751.972657


# Grid Search

In [13]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
# Search space: 2 colsample_bytree values x 1 n_estimators value x 2 max_depth
# values = 4 candidate settings.
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}

# Base estimator handed to the search.
gbm = xgb.XGBRegressor(objective='reg:squarederror')

# Exhaustive search with 4-fold cross-validation, scored by negated MSE.
grid_mse = GridSearchCV(
    estimator=gbm,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=4,
    verbose=1,
)
grid_mse.fit(X, y)

# The scorer is 'neg_mean_squared_error', so the absolute value is taken
# before the square root to report an RMSE.
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    2.4s finished


Best parameters found:  {'colsample_bytree': 0.7, 'max_depth': 5, 'n_estimators': 50}
Lowest RMSE found:  30540.19922467927


## Random Search

In [14]:
from sklearn.model_selection import RandomizedSearchCV
# Search space: n_estimators fixed at 25, max_depth drawn from 2..11.
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# The n_estimators=10 given here is replaced by the value from gbm_param_grid
# for every candidate the search fits.
gbm = xgb.XGBRegressor(n_estimators=10, objective="reg:squarederror")

# n_iter=5 samples 5 of the 10 possible settings. random_state pins which ones
# are drawn so that re-running the cell evaluates the same candidates (123 is
# the seed used by the other cells in this notebook).
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm,
                                    scoring='neg_mean_squared_error', cv=4, verbose=1, n_iter=5,
                                    random_state=123)

randomized_mse.fit(X,y)

# The scorer is 'neg_mean_squared_error', so the absolute value is taken
# before the square root to report an RMSE.
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best parameters found:  {'n_estimators': 25, 'max_depth': 5}
Lowest RMSE found:  36636.35808132903


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    3.6s finished


## Pipelines for XGBoost

In [24]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Replace missing LotFrontage entries with 0 before the rows are vectorised.
X.LotFrontage = X.LotFrontage.fillna(0)

# Two-stage pipeline: DictVectorizer turns each row dict into a dense numeric
# vector (sparse=False), then an XGBRegressor is fitted on the result.
steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("xgb_model", xgb.XGBRegressor(objective='reg:squarederror')),
]
xgb_pipeline = Pipeline(steps)

# DictVectorizer consumes a list of {column: value} dicts, hence
# to_dict("records"). The fitted pipeline is the cell's displayed value.
xgb_pipeline.fit(X.to_dict("records"), y)

Pipeline(memory=None,
     steps=[('ohe_onestep', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('xgb_model', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rat...lpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1))])

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Replace missing LotFrontage entries with 0 (a no-op when they are already filled).
X.LotFrontage = X.LotFrontage.fillna(0)

# Same two-stage pipeline as before, this time with max_depth=2 for the regressor.
steps = [
    ("ohe_onestep", DictVectorizer(sparse=False)),
    ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:squarederror")),
]
xgb_pipeline = Pipeline(steps)

# 10-fold cross-validation of the whole pipeline, scored by negated MSE.
cross_val_scores = cross_val_score(
    xgb_pipeline,
    X.to_dict("records"),
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)
# Convert each fold's negated MSE to an RMSE, then average across folds.
print("10-fold RMSE: ", np.sqrt(np.abs(cross_val_scores)).mean())

10-fold RMSE:  29867.603720688923
