# Setup

Make sure this notebook works in both Python 2 and Python 3, import a few common modules, and ensure Matplotlib plots figures inline.

In [668]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # for retina screens
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

Configure the notebook to display the result of every expression in a cell, not only the last one

In [669]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell instead of only the last one.
# Later cells that end with two bare expressions (e.g. the y_pr_test / y_pr_pred slices)
# rely on this to show both results.
InteractiveShell.ast_node_interactivity = "all"

# Revert to the last line of output only
# InteractiveShell.ast_node_interactivity = "last_expr"

# Load Data

In [670]:
import pandas as pd

# Locations of the pre-encoded training and test tables, relative to this notebook
train_csv_path = '../Data/encoded_houses_train.csv'
test_csv_path = '../Data/encoded_houses_test.csv'

# Read each CSV into its own DataFrame
houses_train = pd.read_csv(train_csv_path)
houses_test = pd.read_csv(test_csv_path)

In [671]:
# Report the (rows, columns) shape of both frames, training set first
train_shape = houses_train.shape
test_shape = houses_test.shape
print("houses_train dimensions: {}".format(train_shape))
print("houses_test dimensions: {}".format(test_shape))

houses_train dimensions: (1460, 264)
houses_test dimensions: (1459, 264)


In [672]:
# Raise pandas' column display limit to 400 so wide frames are shown in full
pd.options.display.max_columns = 400
# Preview the first three rows of the training data
houses_train.head(n=3)

Unnamed: 0,MSSubClass30,MSSubClass50,MSSubClass60,MSSubClass70,MSSubClass80,MSSubClass85,MSSubClass90,MSSubClass120,MSSubClass160,MSSubClass190,MSZoningFV,MSZoningRH,MSZoningRL,MSZoningRM,LotFrontage,LotArea,StreetPave,AlleyPave,AlleyNA,LotShapeIR1,LotShapeIR2,LotShapeIR3,LandContourBnk,LandContourHLS,LandContourLow,LotConfigCorner,LotConfigCulDSac,LotConfigFR2,LandSlopeMod,LandSlopeSev,NeighborhoodBlueste,NeighborhoodBrDale,NeighborhoodBrkSide,NeighborhoodClearCr,NeighborhoodCollgCr,NeighborhoodCrawfor,NeighborhoodEdwards,NeighborhoodGilbert,NeighborhoodIDOTRR,NeighborhoodMeadowV,NeighborhoodMitchel,NeighborhoodNAmes,NeighborhoodNoRidge,NeighborhoodNPkVill,NeighborhoodNridgHt,NeighborhoodNWAmes,NeighborhoodOldTown,NeighborhoodSWISU,NeighborhoodSawyer,NeighborhoodSawyerW,NeighborhoodSomerst,NeighborhoodStoneBr,NeighborhoodTimber,NeighborhoodVeenker,Condition1Feedr,Condition1Norm,Condition1RRAn,Condition1PosN,Condition1RRAe,BldgType2fmCon,BldgTypeDuplex,BldgTypeTwnhsE,BldgTypeTwnhs,HouseStyle1.5Fin,HouseStyle2Story,HouseStyle2.5Fin,HouseStyleSFoyer,HouseStyleSLvl,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyleGable,RoofStyleOther,RoofStyleHip,RoofMatlCompShg,RoofMatlTar&Grv,RoofMatlWood,Exterior1stBrick,Exterior1stCement,Exterior1stHdBoard,Exterior1stStucco,Exterior1stMetalSd,Exterior1stVinylSd,Exterior1stPlywood,Exterior1stWd Sdng,Exterior2ndBrick,Exterior2ndCement,Exterior2ndHdBoard,Exterior2ndStucco,Exterior2ndMetalSd,Exterior2ndVinylSd,Exterior2ndPlywood,Exterior2ndWd Sdng,Exterior2ndWd 
Shng,MasVnrTypeBrkFace,MasVnrTypeNone,MasVnrTypeStone,MasVnrArea,ExterQual4,ExterQual3,ExterQual2,ExterCondGd,ExterCondTA,ExterCondFa,ExterCondPo,FoundationCBlock,FoundationPConc,FoundationSlab,FoundationOther,BsmtQualGd,BsmtQualTA,BsmtQualFa,BsmtQualNA,BsmtCondTA,BsmtCondFa,BsmtCondNA,BsmtExposureAv,BsmtExposureMn,BsmtExposureNo,BsmtExposureNA,BsmtFinType1ALQ,BsmtFinType1BLQ,BsmtFinType1Rec,BsmtFinType1LwQ,BsmtFinType1Unf,BsmtFinType1NA,BsmtFinSF1,BsmtFinType2ALQ,BsmtFinType2BLQ,BsmtFinType2Rec,BsmtFinType2LwQ,BsmtFinType2Unf,BsmtFinType2NA,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HeatingGasA,HeatingGasW,HeatingQC4,HeatingQC3,HeatingQC2,HeatingQC1,CentralAirY,ElectricalFuseA,ElectricalFuseFP,X1stFlrSF,X2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual4,KitchenQual3,KitchenQual2,TotRmsAbvGrd,FunctionalMin1,FunctionalMin2,FunctionalMod,FunctionalMaj,Fireplaces,FireplaceQuGd,FireplaceQuTA,FireplaceQuFa,FireplaceQuPo,FireplaceQuNA,GarageTypeAttchd,GarageTypeBasment,GarageTypeBuiltIn,GarageTypeDetchd,GarageTypeNA,GarageYrBlt,GarageFinishRFn,GarageFinishUnf,GarageFinishNA,GarageCars,GarageArea,GarageQualTA,GarageQualFaPo,GarageQualNA,GarageCondTA,GarageCondFaPo,GarageCondNA,PavedDriveP,PavedDriveN,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,PoolQCNA,FenceMnPrv,FenceGdWo,FenceMnWw,FenceNA,MiscFeatureShed,MiscVal,SaleTypeNew,SaleTypeCOD,SaleTypeCon,SaleConditionAbnorml,SaleConditionAlloca,SaleConditionFamily,SaleConditionPartial,LotArea.LandContour.interaction7478.Lvl,LotArea.LandContour.interaction9453.Lvl,LotArea.LandContour.interaction11570.Lvl,LotArea.LandContour.interaction215245.Lvl,LotArea.LandContour.interaction1300.Bnk,LotArea.LandContour.interaction7478.Bnk,LotArea.LandContour.interaction9453.Bnk,LotArea.LandContour.interaction11570.Bnk,LotArea.LandContour.interaction215245.Bnk,LotArea.LandContour.interaction1300.HLS,LotArea.LandContour.interaction7478.HLS,LotArea.LandContour.interac
tion9453.HLS,LotArea.LandContour.interaction11570.HLS,LotArea.LandContour.interaction215245.HLS,LotArea.LandContour.interaction1300.Low,LotArea.LandContour.interaction7478.Low,LotArea.LandContour.interaction9453.Low,LotArea.LandContour.interaction11570.Low,LotArea.LandContour.interaction215245.Low,Garage.interaction1.ExGd,Garage.interaction2.ExGd,Garage.interaction3.ExGd,Garage.interaction1.TA,Garage.interaction2.TA,Garage.interaction3.TA,Garage.interaction4.TA,Garage.interaction1.FaPo,Garage.interaction2.FaPo,Garage.interaction3.FaPo,Basement.interactionGd.0,Basement.interactionTA.0,Basement.interactionFa.0,Basement.interactionNA.0,Basement.interactionEx.1,Basement.interactionGd.1,Basement.interactionTA.1,Basement.interactionFa.1,Basement.interactionEx.2,Basement.interactionGd.2,Basement.interactionTA.2,Basement.interactionFa.2,Basement.interactionGd.3,Kitchen.interaction1.4,Kitchen.interaction1.3,Kitchen.interaction2.3,Kitchen.interaction1.2,Kitchen.interaction2.2,new.old,Room.size,full.YrSold,QuarterSold,TotalBath,BathToBed,AvgHouseLivArea.ratio,SalePrice
0,-0.222645,-0.34576,1.905894,-0.206949,-0.203325,-0.117811,-0.192111,-0.251638,-0.229337,-0.144792,-0.215785,-0.105227,0.517956,-0.418812,-0.24015,-0.207071,0.064216,-0.169923,0.257733,-0.703962,-0.169923,-0.083017,-0.212287,-0.188246,-0.158945,-0.468578,-0.262234,-0.190187,-0.215785,-0.094752,-0.037024,-0.105227,-0.203325,-0.139784,2.954209,-0.190187,-0.27107,-0.239094,-0.161194,-0.108503,-0.186288,-0.426687,-0.169923,-0.07873,-0.235877,-0.229337,-0.289539,-0.131946,-0.230986,-0.205144,-0.250096,-0.131946,-0.163415,-0.087099,-0.242277,0.398273,-0.147237,-0.137218,-0.094752,-0.147237,-0.192111,-0.290925,-0.174141,-0.360475,1.509747,-0.114788,-0.161194,-0.215785,-0.651256,0.517023,1.050634,0.878367,0.528571,-0.117811,-0.493401,0.134606,-0.087099,-0.087099,-0.195909,-0.21052,-0.423319,-0.134606,-0.421067,1.35414,-0.282537,-0.405169,-0.161194,-0.208741,-0.406313,-0.158945,-0.414285,1.374698,-0.328124,-0.394805,-0.163415,1.509747,-1.217365,-0.309888,0.513928,1.410829,-1.278381,-0.098363,-0.333219,0.372492,-0.139784,-0.026171,-0.875802,1.120584,-0.129235,-0.07873,1.166845,-0.894259,-0.156667,-0.161194,0.33701,-0.182318,-0.161194,-0.422194,-0.290925,0.729136,-0.163415,-0.421067,-0.335749,-0.316477,-0.230986,-0.645902,-0.161194,0.575228,-0.114788,-0.152018,-0.195909,-0.180304,0.402876,-0.163415,-0.288554,-0.944267,-0.459145,0.149645,-0.111688,-0.444486,-0.643774,-0.186288,-0.026171,0.263722,-0.262234,-0.147237,-0.793162,1.161454,-0.120201,0.370207,1.107431,-0.240978,0.78947,1.227165,0.163723,-0.211381,1.220838,-1.006528,-0.16561,0.911897,-0.147237,-0.154359,-0.10185,-0.117811,-0.950901,-0.592968,-0.522206,-0.152018,-0.117811,1.05602,0.823223,-0.114788,-0.253172,-0.600353,-0.242277,1.00706,1.567811,-0.840903,-0.242277,0.311618,0.35088,0.33701,-0.190187,-0.242277,0.317784,-0.172043,-0.242277,-0.144792,-0.25622,-0.751918,0.216429,-0.359202,-0.116299,-0.270116,-0.068668,0.069385,-0.346999,-0.195909,-0.087099,0.488031,-0.186288,-0.087658,-0.301858,-0.174141,-0.129235,-0.278276
,-0.091003,-0.117811,-0.30589,-0.457676,-0.469664,-0.457676,-0.472919,-0.064216,-0.111688,-0.091003,-0.105227,-0.087099,-0.069385,-0.083017,-0.094752,-0.087099,-0.07873,-0.058601,-0.087099,-0.045361,-0.091003,-0.058601,-0.052396,-0.091003,-0.026171,-0.53916,0.90924,-0.372492,-0.058601,-0.158945,-0.094752,-0.037024,-0.517956,-0.583478,-0.139784,-0.161194,-0.23749,1.973995,-0.474002,-0.064216,-0.058601,-0.083017,-0.083017,-0.026171,-0.026171,1.220838,-0.928253,-0.205144,-0.154359,-0.058601,0.228869,-0.377916,0.106482,-1.5682,1.647115,0.513377,-0.317942,208500.0
1,-0.222645,-0.34576,-0.524329,-0.206949,-0.203325,-0.117811,-0.192111,-0.251638,-0.229337,-0.144792,-0.215785,-0.105227,0.517956,-0.418812,0.340726,-0.091855,0.064216,-0.169923,0.257733,-0.703962,-0.169923,-0.083017,-0.212287,-0.188246,-0.158945,-0.468578,-0.262234,5.254382,-0.215785,-0.094752,-0.037024,-0.105227,-0.203325,-0.139784,-0.338268,-0.190187,-0.27107,-0.239094,-0.161194,-0.108503,-0.186288,-0.426687,-0.169923,-0.07873,-0.235877,-0.229337,-0.289539,-0.131946,-0.230986,-0.205144,-0.250096,-0.131946,-0.163415,11.473319,4.124686,-2.50912,-0.147237,-0.137218,-0.094752,-0.147237,-0.192111,-0.290925,-0.174141,-0.360475,-0.661909,-0.114788,-0.161194,-0.215785,0.071812,-2.178881,0.15668,-0.42943,0.528571,-0.117811,-0.493401,0.134606,-0.087099,-0.087099,-0.195909,-0.21052,-0.423319,-0.134606,2.37329,-0.73797,-0.282537,-0.405169,-0.161194,-0.208741,-0.406313,-0.158945,2.412145,-0.726934,-0.328124,-0.394805,-0.163415,-0.661909,0.820884,-0.309888,-0.570555,-0.708318,0.781703,-0.098363,-0.333219,0.372492,-0.139784,-0.026171,1.141029,-0.891781,-0.129235,-0.07873,1.166845,-0.894259,-0.156667,-0.161194,0.33701,-0.182318,-0.161194,-0.422194,-0.290925,-1.370546,-0.163415,2.37329,-0.335749,-0.316477,-0.230986,-0.645902,-0.161194,1.171591,-0.114788,-0.152018,-0.195909,-0.180304,0.402876,-0.163415,-0.288554,-0.641008,0.466305,0.149645,-0.111688,-0.444486,-0.643774,-0.186288,-0.026171,0.263722,-0.262234,-0.147237,0.257052,-0.794891,-0.120201,-0.482347,-0.819684,3.947457,0.78947,-0.76136,0.163723,-0.211381,-0.818548,0.992834,-0.16561,-0.318574,-0.147237,-0.154359,-0.10185,-0.117811,0.600289,-0.592968,1.913642,-0.152018,-0.117811,-0.946303,0.823223,-0.114788,-0.253172,-0.600353,-0.242277,-0.019293,1.567811,-0.840903,-0.242277,0.311618,-0.06071,0.33701,-0.190187,-0.242277,0.317784,-0.172043,-0.242277,-0.144792,-0.25622,1.625638,-0.704242,-0.359202,-0.116299,-0.270116,-0.068668,0.069385,-0.346999,-0.195909,-0.087099,0.488031,-0.186288,-0.087658,-0.301858,-0.174141,-0.129235,-0.278
276,-0.091003,-0.117811,-0.30589,2.183457,-0.469664,-0.457676,-0.472919,-0.064216,-0.111688,-0.091003,-0.105227,-0.087099,-0.069385,-0.083017,-0.094752,-0.087099,-0.07873,-0.058601,-0.087099,-0.045361,-0.091003,-0.058601,-0.052396,-0.091003,-0.026171,-0.53916,0.90924,-0.372492,-0.058601,-0.158945,-0.094752,-0.037024,-0.517956,-0.583478,-0.139784,-0.161194,-0.23749,1.973995,-0.474002,-0.064216,-0.058601,-0.083017,-0.083017,-0.026171,-0.026171,-0.818548,1.076554,-0.205144,-0.154359,-0.058601,-0.232155,-0.467335,-0.625911,-0.486683,0.382768,-0.791424,-1.163214,181500.0
2,-0.222645,-0.34576,1.905894,-0.206949,-0.203325,-0.117811,-0.192111,-0.251638,-0.229337,-0.144792,-0.215785,-0.105227,0.517956,-0.418812,-0.123975,0.073455,0.064216,-0.169923,0.257733,1.419559,-0.169923,-0.083017,-0.212287,-0.188246,-0.158945,-0.468578,-0.262234,-0.190187,-0.215785,-0.094752,-0.037024,-0.105227,-0.203325,-0.139784,2.954209,-0.190187,-0.27107,-0.239094,-0.161194,-0.108503,-0.186288,-0.426687,-0.169923,-0.07873,-0.235877,-0.229337,-0.289539,-0.131946,-0.230986,-0.205144,-0.250096,-0.131946,-0.163415,-0.087099,-0.242277,0.398273,-0.147237,-0.137218,-0.094752,-0.147237,-0.192111,-0.290925,-0.174141,-0.360475,1.509747,-0.114788,-0.161194,-0.215785,-0.651256,0.517023,0.984415,0.82993,0.528571,-0.117811,-0.493401,0.134606,-0.087099,-0.087099,-0.195909,-0.21052,-0.423319,-0.134606,-0.421067,1.35414,-0.282537,-0.405169,-0.161194,-0.208741,-0.406313,-0.158945,-0.414285,1.374698,-0.328124,-0.394805,-0.163415,1.509747,-1.217365,-0.309888,0.325803,1.410829,-1.278381,-0.098363,-0.333219,0.372492,-0.139784,-0.026171,-0.875802,1.120584,-0.129235,-0.07873,1.166845,-0.894259,-0.156667,-0.161194,0.33701,-0.182318,-0.161194,-0.422194,3.434957,-1.370546,-0.163415,-0.421067,-0.335749,-0.316477,-0.230986,-0.645902,-0.161194,0.092875,-0.114788,-0.152018,-0.195909,-0.180304,0.402876,-0.163415,-0.288554,-0.30154,-0.313261,0.149645,-0.111688,-0.444486,-0.643774,-0.186288,-0.026171,0.263722,-0.262234,-0.147237,-0.627611,1.188943,-0.120201,0.514836,1.107431,-0.240978,0.78947,1.227165,0.163723,-0.211381,1.220838,-1.006528,-0.16561,-0.318574,-0.147237,-0.154359,-0.10185,-0.117811,0.600289,-0.592968,1.913642,-0.152018,-0.117811,-0.946303,0.823223,-0.114788,-0.253172,-0.600353,-0.242277,0.931034,1.567811,-0.840903,-0.242277,0.311618,0.63151,0.33701,-0.190187,-0.242277,0.317784,-0.172043,-0.242277,-0.144792,-0.25622,-0.751918,-0.070337,-0.359202,-0.116299,-0.270116,-0.068668,0.069385,-0.346999,-0.195909,-0.087099,0.488031,-0.186288,-0.087658,-0.301858,-0.174141,-0.129235,-0.278276
,-0.091003,-0.117811,-0.30589,-0.457676,2.127722,-0.457676,-0.472919,-0.064216,-0.111688,-0.091003,-0.105227,-0.087099,-0.069385,-0.083017,-0.094752,-0.087099,-0.07873,-0.058601,-0.087099,-0.045361,-0.091003,-0.058601,-0.052396,-0.091003,-0.026171,-0.53916,0.90924,-0.372492,-0.058601,-0.158945,-0.094752,-0.037024,-0.517956,-0.583478,-0.139784,-0.161194,-0.23749,1.973995,-0.474002,-0.064216,-0.058601,-0.083017,-0.083017,-0.026171,-0.026171,1.220838,-0.928253,-0.205144,-0.154359,-0.058601,0.034614,1.499874,0.159335,0.594835,1.647115,0.513377,-0.132986,223500.0


In [673]:
# Print the index / column / dtype / memory summary of each frame, training set first
for frame in (houses_train, houses_test):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 264 entries, MSSubClass30 to SalePrice
dtypes: float64(264)
memory usage: 2.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 264 entries, MSSubClass30 to SalePrice
dtypes: float64(263), int64(1)
memory usage: 2.9 MB


Delete first column (currently disabled: both `drop` calls in the next cell are commented out)

In [674]:
# NOTE(review): kept disabled. The info() output recorded above lists the columns as
# "MSSubClass30 to SalePrice", so there appears to be no 'Unnamed: 0' column to drop;
# with errors='raise' these calls would then fail with a KeyError. Confirm against the
# CSV files and remove this cell if it is no longer needed.
# houses_train.drop('Unnamed: 0', axis=1, inplace=True, errors='raise')
# houses_test.drop('Unnamed: 0', axis=1, inplace=True, errors='raise')

# Run xgboost

### Create private training & test set

In [675]:
from sklearn.model_selection import train_test_split

# Settings for the private hold-out split made in the next cell
seed = 10
test_ratio = 0.2

# Boolean mask over the columns: True only for the target column
is_target = houses_train.columns == "SalePrice"

# Features: every non-target column, converted to a NumPy array
X = houses_train.loc[:, ~is_target].values

# Target: log(SalePrice + 1), flattened to 1-D (n_samples,) for model fitting
y = np.log(houses_train.loc[:, is_target].values + 1).ravel()


In [676]:
# Split features and target into private train / test parts (test share = test_ratio),
# with a fixed random_state so the split is repeatable
split_parts = train_test_split(X, y, test_size=test_ratio, random_state=seed)
X_pr_train, X_pr_test, y_pr_train, y_pr_test = split_parts

In [677]:
# Report how many rows ended up in each part of the private split
n_train = len(X_pr_train)
n_test = len(X_pr_test)
print(n_train, "train +", n_test, "test")

1168 train + 292 test


### Fit Model

In [678]:
from xgboost import XGBRegressor

# Hyper-parameters of the baseline model, grouped by role and passed as keyword arguments.
baseline_params = {
    # --- boosting setup ---
    'booster': 'gbtree', # Specify which booster to use: gbtree, gblinear or dart
    #for dart see http://xgboost.readthedocs.io/en/latest/tutorials/dart.html
    'objective': 'reg:linear',
    'n_estimators': 1000, # Number of boosted trees to fit
    'learning_rate': 0.1,
    'base_score': 0.5, # The initial prediction score of all instances, global bias

    # --- tree shape ---
    'max_depth': 3,
    'gamma': 0, # Minimum loss reduction required to make a further partition on a leaf node of the tree.
    'min_child_weight': 1, # Minimum sum of instance weight(hessian) needed in a child
    'max_delta_step': 0, # Maximum delta step we allow each tree's weight estimation to be

    # --- row / column sampling ---
    'subsample': 1, # Subsample ratio of the training instance
    'colsample_bytree': 1, # Subsample ratio of columns when constructing each tree
    'colsample_bylevel': 1, # Subsample ratio of columns for each split, in each level

    # --- regularization and class weighting ---
    'reg_alpha': 0, # L1 regularization term on weights
    'reg_lambda': 1, # L2 regularization term on weights
    'scale_pos_weight': 1, # Balancing of positive and negative weights

    # --- runtime ---
    'n_jobs': -1, # Number of parallel threads used to run xgboost. (replaces nthread)
    'silent': False, # print messages while running
    'random_state': 743,
    'missing': None, # Value in the data which needs to be present as a missing value. If None, defaults to np.nan
}

# Despite the "_clf" suffix this is a regressor; the name is kept because later cells use it.
xgb_clf = XGBRegressor(**baseline_params)



In [679]:
# Train the baseline regressor on the private training split.
# fit() returns the estimator, which is why its repr is echoed as this cell's output.
xgb_clf.fit(X_pr_train, y_pr_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=743,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

In [680]:
# Make predictions for the private test split with the baseline model.
# The model was fit on log(SalePrice + 1), so the predictions are on that log scale too.

y_pr_pred = xgb_clf.predict(X_pr_test)

Evaluate predictions

In [681]:
from sklearn.metrics import mean_squared_error

# Root mean squared error between the held-out targets and the predictions.
# Both arrays are on the log(SalePrice + 1) scale, so the error is in log units.
mse = mean_squared_error(y_true=y_pr_test, y_pred=y_pr_pred)
rmse = np.sqrt(mse)
rmse


0.12973293846602865

In [682]:
# Eyeball a few true values against the corresponding predictions (both on the log scale).
# NOTE(review): the slice [1:5] selects indices 1-4 (four values) and skips index 0;
# use [:5] if the first five entries were intended.
y_pr_test[1:5]
y_pr_pred[1:5]

array([ 12.1428719 ,  11.8277435 ,  12.01067193,  12.64109979])

array([ 12.21458626,  11.84240341,  11.88090611,  12.6495924 ], dtype=float32)

Save model to file

In [683]:
# joblib is more memory efficient than pickle for large numpy arrays.
# Prefer the standalone package: sklearn.externals.joblib was deprecated in
# scikit-learn 0.21 and removed in 0.23. Fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Create the output folder first: joblib.dump does not create missing directories.
models_dir = './Models'
if not os.path.isdir(models_dir):
    os.makedirs(models_dir)

# Persist the fitted baseline model; dump() returns the list of file names written.
joblib.dump(xgb_clf, './Models/xgboost_model.pkl')

['./Models/xgboost_model.pkl']

To load a model:

In [684]:
# Loading unpickles the file, which can execute arbitrary code: only load model files you trust.
# xgb_clf_loaded = joblib.load('./Models/xgboost_model.pkl') 

### Model Tuning

In [685]:
from sklearn.model_selection import GridSearchCV

# Constructor arguments for the base estimator handed to GridSearchCV.
# Every key here also appears in param_grid below, so the grid's values
# replace these while the search runs.
xgb_params = {'max_depth': 3,
              'learning_rate': 0.1, 
              'n_estimators': 100, 
              'objective': 'reg:linear'}

# Search space. Only max_depth, learning_rate and n_estimators list more than one
# value (2 x 3 x 3 = 18 candidates); the single-element lists pin the remaining
# parameters to one fixed value.
param_grid = {'max_depth': [3, 5], 
              'learning_rate': [0.05, 0.07, 0.09], 
              'n_estimators': [1500, 2000, 2500], # Number of boosted trees to fit
              'objective': ['reg:linear'], 
              'booster': ['gbtree'], # Specify which booster to use: gbtree, gblinear or dart
              'n_jobs': [-1], # Number of parallel threads used to run xgboost. (replaces nthread)
              'gamma': [0],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
              'min_child_weight': [1], # Minimum sum of instance weight(hessian) needed in a child
              'max_delta_step': [0], # Maximum delta step we allow each tree’s weight estimation to be
              'subsample': [1], # Subsample ratio of the training instance
              'colsample_bytree': [0.6], # Subsample ratio of columns when constructing each tree
              'colsample_bylevel': [0.3], # Subsample ratio of columns for each split, in each level
              'reg_alpha': [0], # L1 regularization term on weights
              'reg_lambda': [1], # L2 regularization term on weights
              'scale_pos_weight': [1], # Balancing of positive and negative weights
              'base_score': [0.5], # The initial prediction score of all instances, global bias
              'silent': [True],
              'random_state': [10]}

# Exhaustive search over param_grid with 3-fold cross-validation.
optimized_xgb_clf = GridSearchCV(XGBRegressor(**xgb_params), # scikit-learn estimator interface 
                                 param_grid = param_grid, # Dictionary with parameters names (string) as keys
                                 scoring="neg_mean_squared_error", # Metric used to rank candidates: negated MSE, so values closer to 0 are better
                                 # Number of jobs to run in parallel (-1 = use all processors).
                                 # NOTE(review): param_grid also sets the estimator's own n_jobs to -1, so the
                                 # two levels of parallelism may oversubscribe the CPU; consider 1 on one level.
                                 n_jobs=-1,
                                 iid=True, # If True, the data is assumed to be identically distributed across the folds
                                 refit=True, # Refit an estimator using the best found parameters (best_estimator_)
                                 cv=3, #integer, to specify the number of folds in a (Stratified)KFold. None -> default 3-fold cross validation
                                 verbose=10, # the higher, the more messages
                                 pre_dispatch="2*n_jobs", # number of jobs that get dispatched during parallel execution
                                 error_score="raise", # Raise the exception if a fit fails instead of recording a fallback score
                                 return_train_score=False) #If False, the cv_results_ attribute will not include training scores

In [686]:
# Alternative, currently disabled search space kept for reference: learning_rate fixed
# at 0.07, with max_depth (3 values), n_estimators (5 values) and gamma (2 values)
# varied, i.e. 30 candidates. To use it, uncomment it and re-create the GridSearchCV
# object afterwards so that it picks up the new param_grid.
# param_grid = {'max_depth': [3, 4, 5], 
#               'learning_rate': [0.07], 
#               'n_estimators': [1500, 1800, 2000, 2200, 2400], # Number of boosted trees to fit
#               'objective': ['reg:linear'], 
#               'booster': ['gbtree'], # Specify which booster to use: gbtree, gblinear or dart
#               'n_jobs': [-1], # Number of parallel threads used to run xgboost. (replaces nthread)
#               'gamma': [0, 0.5],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
#               'min_child_weight': [1], # Minimum sum of instance weight(hessian) needed in a child
#               'max_delta_step': [0], # Maximum delta step we allow each tree’s weight estimation to be
#               'subsample': [1], # Subsample ratio of the training instance
#               'colsample_bytree': [0.6], # Subsample ratio of columns when constructing each tree
#               'colsample_bylevel': [0.3], # Subsample ratio of columns for each split, in each level
#               'reg_alpha': [0], # L1 regularization term on weights
#               'reg_lambda': [1], # L2 regularization term on weights
#               'scale_pos_weight': [1], # Balancing of positive and negative weights
#               'base_score': [0.5], # The initial prediction score of all instances, global bias
#               'silent': [True],
#               'random_state': [10]}

Inspect the grid

In [None]:
# Bare expression: echo the GridSearchCV repr to check the base estimator and the grid before fitting.
optimized_xgb_clf

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5], 'learning_rate': [0.05, 0.07, 0.09], 'n_estimators': [1500, 2000, 2500], 'objective': ['reg:linear'], 'booster': ['gbtree'], 'n_jobs': [-1], 'gamma': [0], 'min_child_weight': [1], 'max_delta_step': [0], 'subsample': [1], 'colsample_bytree': [0.6], 'colsample_bylevel': [0.3], 'reg_alpha': [0], 'reg_lambda': [1], 'scale_pos_weight': [1], 'base_score': [0.5], 'silent': [True], 'random_state': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
     

Run grid tuning

In [None]:
# Run the search on the private training split: 18 candidates x 3 folds = 54 fits,
# followed by a refit of the best candidate (refit=True). verbose=10 makes this cell's log long.
optimized_xgb_clf.fit(X_pr_train, y_pr_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booste

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.0s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.01752412567377578, total=   7.8s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013681817144109222, total=   7.6s
[

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.3s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014832122235590831, total=   6.1s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.016242615370238328, total=   6.0s


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   29.9s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=2500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014832122235590831, total=   7.5s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015500376363577767, total=   4.2s


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   38.3s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.0179084355090726, total=   6.9s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.018795095434771168, total=   4.9s
[C

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   51.8s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=2500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.018795095434771168, total=   6.4s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.018160325225740123, total=   4.2s


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min


The best combination of parameters is:

In [None]:
# Parameter combination of the best-scoring candidate, including the values pinned by single-element lists.
optimized_xgb_clf.best_params_

In [None]:
# The best score is the mean cross-validated score of the best candidate, i.e. it is
# measured on the held-out CV folds, not on the training data. With
# scoring="neg_mean_squared_error" it is a negated MSE on the log target:
# closer to 0 is better, and np.sqrt(-best_score_) gives the cross-validated RMSE.
optimized_xgb_clf.best_score_

In [None]:
# Uncomment to inspect the per-candidate cross-validation results (timings and test scores;
# training scores are not included because return_train_score=False).
# optimized_xgb_clf.cv_results_

In [None]:
# Make predictions for the private test split with the tuned model. GridSearchCV was built
# with refit=True, so predict() uses the estimator refit with the best parameters.
# This reuses the name y_pr_pred, replacing the baseline model's predictions.

y_pr_pred = optimized_xgb_clf.predict(X_pr_test)

Evaluate predictions

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the tuned model on the private test split.
# Both arrays are on the log(SalePrice + 1) scale, so the error is in log units.
mse = mean_squared_error(y_true=y_pr_test, y_pred=y_pr_pred)
rmse = np.sqrt(mse)
rmse


In [None]:
# Eyeball a few true values against the tuned model's predictions (both on the log scale).
# NOTE(review): the slice [1:5] selects indices 1-4 (four values) and skips index 0;
# use [:5] if the first five entries were intended.
y_pr_test[1:5]
y_pr_pred[1:5]