# Setup

Make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline

In [72]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # for retina screens
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

Configure notebook to display all results in cell

In [73]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Revert to the last line of output only
# InteractiveShell.ast_node_interactivity = "last_expr"

Label count encode:

In [74]:
class LabelCountEncoder(object):
    def __init__(self):
        self.count_dict = {}
    
    def fit(self, column):
        # This gives you a dictionary with level as the key and counts as the value
        count = column.value_counts().to_dict()
        # We want to rank the key by its value and use the rank as the new value
        self.count_dict = {key[0]: rank+1 for rank, key in enumerate(sorted(count.items(), key=lambda x: x[1]))}
    
    def transform(self, column):
        # If a category only appears in the test set, we will assign the value to zero.
        missing = 0
        return column.apply(lambda x : self.count_dict.get(x, missing))
    
    def fit_transform(self, column):
        self.fit(column)
        return self.transform(column)

# Load Data

In [75]:
import pandas as pd

houses_train = pd.read_csv('../Data/features_houses_train.csv')
houses_test = pd.read_csv('../Data/features_houses_test.csv')

#houses_train = pd.read_csv('../Data/encoded_houses_train.csv')
#houses_test = pd.read_csv('../Data/encoded_houses_test.csv')

In [76]:
print("houses_train dimensions: {}".format(houses_train.shape))
print("houses_test dimensions: {}".format(houses_test.shape))

houses_train dimensions: (1460, 85)
houses_test dimensions: (1459, 85)


In [77]:
pd.set_option("display.max_columns", 400)
houses_train.head(3)

Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,X1stFlrSF,X2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,Garage.interaction,Basement.interaction,Kitchen.interaction,new.old,Room.size,full.YrSold,QuarterSold,TotalBath,AvgHouseLivArea.ratio,SalePrice
0,1,60,RL,65.0,8450,,Reg,Lvl,Inside,Gtl,CollgCr,Norm,1Fam,2Story,4,6,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196,4,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,5,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,4,8,Typ,0,,Attchd,2003,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,WD,Normal,2.TA,Gd.1,1.4,2.163265,214,200802,1,3.5,0.932072,208500.0
1,2,20,RL,80.0,9600,,Reg,Lvl,FR2,Gtl,Veenker,Feedr,1Fam,1Story,5,3,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0,3,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,5,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,3,6,Typ,1,TA,Attchd,1976,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,WD,Normal,2.TA,Gd.1,1.3,-2.583333,210,200705,2,2.5,0.742753,181500.0
2,3,60,RL,68.0,11250,,IR1,Lvl,Inside,Gtl,CollgCr,Norm,1Fam,2Story,4,6,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162,4,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,5,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,4,6,Typ,1,TA,Attchd,2001,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,WD,Normal,2.TA,Gd.1,1.4,0.163265,298,200809,3,3.5,0.973497,223500.0


In [78]:
houses_train.fillna(0)
houses_test.fillna(0)

Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,X1stFlrSF,X2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,Garage.interaction,Basement.interaction,Kitchen.interaction,new.old,Room.size,full.YrSold,QuarterSold,TotalBath,AvgHouseLivArea.ratio,SalePrice
0,1,60,RL,65.000000,8450,0,Reg,Lvl,Inside,Gtl,CollgCr,Norm,1Fam,2Story,4,6,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196,4,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,5,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,4,8,Typ,0,0,Attchd,2003,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Gd.1,1.4,2.163265,214,200802,1,3.5,0.932072,208500.0
1,2,20,RL,80.000000,9600,0,Reg,Lvl,FR2,Gtl,Veenker,Feedr,1Fam,1Story,5,3,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0,3,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,5,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,3,6,Typ,1,TA,Attchd,1976,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Gd.1,1.3,-2.583333,210,200705,2,2.5,0.742753,181500.0
2,3,60,RL,68.000000,11250,0,IR1,Lvl,Inside,Gtl,CollgCr,Norm,1Fam,2Story,4,6,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162,4,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,5,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,4,6,Typ,1,TA,Attchd,2001,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Gd.1,1.4,0.163265,298,200809,3,3.5,0.973497,223500.0
3,4,70,RL,60.000000,9550,0,IR1,Lvl,Corner,Gtl,Crawfor,Norm,1Fam,2Story,4,6,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0,3,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,4,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,4,7,Typ,1,Gd,Detchd,1998,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,0,0,0,0,WD,Abnorml,3.TA,TA.1,1.4,-12.935484,245,200602,1,2.0,0.943106,140000.0
4,5,60,RL,84.000000,14260,0,IR1,Lvl,FR2,Gtl,NoRidge,Norm,1Fam,2Story,3,6,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350,4,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,5,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,4,9,Typ,1,TA,Attchd,2000,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,0,0,0,0,WD,Normal,3.TA,Gd.1,1.4,4.524590,244,200812,4,3.5,0.878388,250000.0
5,6,50,RL,85.000000,14115,0,IR1,Lvl,Inside,Gtl,Mitchel,Norm,1Fam,1.5Fin,6,6,1993,1995,Gable,CompShg,VinylSd,VinylSd,,0,3,TA,Other,Gd,TA,No,GLQ,732,Unf,0,64,796,GasA,5,Y,SBrkr,796,566,0,1362,1,0,1,1,1,1,3,5,Typ,0,0,Attchd,1993,Unf,2,480,TA,TA,Y,40,30,0,320,0,0,0,MnPrv,Shed,700,WD,Normal,2.TA,Gd.1,1.3,16.750000,272,200910,4,2.5,0.808309,143000.0
6,7,20,RL,75.000000,10084,0,Reg,Lvl,Inside,Gtl,Somerst,Norm,1Fam,1Story,3,6,2004,2005,Gable,CompShg,VinylSd,VinylSd,Stone,186,4,TA,PConc,Ex,TA,Av,GLQ,1369,Unf,0,317,1686,GasA,5,Y,SBrkr,1694,0,0,1694,1,0,2,0,3,1,4,7,Typ,1,Gd,Attchd,2004,RFn,2,636,TA,TA,Y,255,57,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Ex.1,1.4,-2.385714,242,200708,3,3.0,1.097689,307000.0
7,8,60,RL,72.246811,10382,0,IR1,Lvl,Corner,Gtl,NWAmes,PosN,1Fam,2Story,4,5,1973,1973,Gable,CompShg,HdBoard,HdBoard,Stone,240,3,TA,CBlock,Gd,TA,Mn,ALQ,859,BLQ,32,216,1107,GasA,5,Y,SBrkr,1107,983,0,2090,1,0,2,1,3,1,3,7,Typ,2,TA,Attchd,1973,RFn,2,484,TA,TA,Y,235,204,228,0,0,0,0,0,Shed,350,WD,Normal,2.TA,Gd.1,1.3,-3.408163,299,200911,4,3.5,1.036046,200000.0
8,9,50,RM,51.000000,6120,0,Reg,Lvl,Inside,Gtl,OldTown,Artery,1Fam,1.5Fin,4,6,1931,1950,Gable,CompShg,Brick,Wd Shng,,0,3,TA,BrkTil,TA,TA,No,Unf,0,Unf,0,952,952,GasA,4,Y,FuseFP,1022,752,0,1774,0,0,2,0,2,2,3,8,Min1,2,TA,Detchd,1931,Unf,2,468,FaPo,TA,Y,90,0,205,0,0,0,0,0,0,0,WD,Abnorml,2.FaPo,TA.0,2.3,7.608696,222,200804,2,2.0,1.223265,129900.0
9,10,190,RL,50.000000,7420,0,Reg,Lvl,Corner,Gtl,BrkSide,Artery,2fmCon,1.5Fin,6,5,1939,1950,Gable,CompShg,MetalSd,MetalSd,,0,3,TA,BrkTil,TA,TA,No,GLQ,851,Unf,0,140,991,GasA,5,Y,SBrkr,1077,0,0,1077,1,0,1,0,2,2,3,5,Typ,2,TA,Attchd,1939,RFn,1,205,ExGd,TA,Y,0,4,0,0,0,0,0,0,0,0,WD,Normal,2.FaPo,TA.1,2.3,-15.500000,215,200801,1,2.0,1.057437,118000.0


Unnamed: 0.1,Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,X1stFlrSF,X2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,X3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,SaleType,SaleCondition,Garage.interaction,Basement.interaction,Kitchen.interaction,new.old,Room.size,full.YrSold,QuarterSold,TotalBath,AvgHouseLivArea.ratio,SalePrice
0,1,20,RH,80.000000,11622,0,Reg,Lvl,Inside,Gtl,NAmes,Feedr,1Fam,1Story,6,5,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0,3,TA,CBlock,TA,TA,No,Rec,468,LwQ,144,270,882,GasA,3,Y,SBrkr,896,0,0,896,0,0,1,0,2,1,3,5,Typ,0,0,Attchd,1961,Unf,1,730,TA,TA,Y,140,0,0,0,120,0,0,MnPrv,0,0,WD,Normal,1.TA,TA.0,1.3,0.972973,179,201006,2,1.0,0.757176,0
1,2,20,RL,81.000000,14267,0,IR1,Lvl,Corner,Gtl,NAmes,Norm,1Fam,1Story,5,5,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108,3,TA,CBlock,TA,TA,No,ALQ,923,Unf,0,406,1329,GasA,3,Y,SBrkr,1329,0,0,1329,0,0,1,1,3,1,4,6,Typ,0,0,Attchd,1958,Unf,1,312,TA,TA,Y,393,36,0,0,0,0,0,0,0,12500,WD,Normal,1.TA,TA.0,1.4,-2.027027,222,201006,2,1.5,1.123088,0
2,3,60,RL,74.000000,13830,0,IR1,Lvl,Inside,Gtl,Gilbert,Norm,1Fam,2Story,6,6,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0,3,TA,PConc,Gd,TA,No,GLQ,791,Unf,0,137,928,GasA,4,Y,SBrkr,928,701,0,1629,0,0,2,1,3,1,3,6,Typ,1,TA,Attchd,1997,Fin,2,482,TA,TA,Y,212,34,0,0,0,0,0,MnPrv,0,0,WD,Normal,2.TA,Gd.0,1.3,-3.025210,272,201003,1,2.5,0.948163,0
3,4,60,RL,78.000000,9978,0,IR1,Lvl,Inside,Gtl,Gilbert,Norm,1Fam,2Story,5,5,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20,3,TA,PConc,TA,TA,No,GLQ,602,Unf,0,324,926,GasA,5,Y,SBrkr,926,678,0,1604,0,0,2,1,3,1,4,7,Typ,1,Gd,Attchd,1998,Fin,2,470,TA,TA,Y,360,36,0,0,0,0,0,0,0,0,WD,Normal,2.TA,TA.0,1.4,-2.025210,229,201006,2,2.5,0.933612,0
4,5,120,RL,43.000000,5005,0,IR1,HLS,Inside,Gtl,StoneBr,Norm,TwnhsE,1Story,3,6,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0,4,TA,PConc,Gd,TA,No,ALQ,263,Unf,0,1017,1280,GasA,5,Y,SBrkr,1280,0,0,1280,0,0,2,0,2,1,4,5,Typ,0,0,Attchd,1992,RFn,2,506,TA,TA,Y,0,82,0,0,144,0,0,0,0,0,WD,Normal,2.TA,Gd.0,1.4,-2.500000,256,201001,1,2.0,0.849370,0
5,6,60,RL,75.000000,10000,0,IR1,Lvl,Corner,Gtl,Gilbert,Norm,1Fam,2Story,5,6,1993,1994,Gable,CompShg,HdBoard,HdBoard,,0,3,TA,PConc,Gd,TA,No,Unf,0,Unf,0,763,763,GasA,4,Y,SBrkr,763,892,0,1655,0,0,2,1,3,1,3,7,Typ,1,TA,Attchd,1993,Fin,2,440,TA,TA,Y,157,84,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Gd.0,1.3,-7.025210,236,201004,2,2.5,0.963296,0
6,7,20,RL,62.830887,7980,0,IR1,Lvl,Inside,Gtl,Gilbert,Norm,1Fam,1Story,5,4,1992,2007,Gable,CompShg,HdBoard,HdBoard,,0,3,Gd,PConc,Gd,TA,No,ALQ,935,Unf,0,233,1168,GasA,5,Y,SBrkr,1187,0,0,1187,1,0,2,0,3,1,3,6,Typ,0,0,Attchd,1992,Fin,2,420,TA,TA,Y,483,21,0,0,0,0,0,GdPrv,Shed,500,WD,Normal,2.TA,Gd.1,1.3,-0.250000,198,201003,1,3.0,0.892006,0
7,8,60,RL,63.000000,8402,0,IR1,Lvl,Inside,Gtl,Gilbert,Norm,1Fam,2Story,5,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,,0,3,TA,PConc,Gd,TA,No,Unf,0,Unf,0,789,789,GasA,4,Y,SBrkr,789,676,0,1465,0,0,2,1,3,1,3,7,Typ,1,Gd,Attchd,1998,Fin,2,393,TA,TA,Y,0,75,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Gd.0,1.3,-2.025210,209,201005,2,2.5,0.852707,0
8,9,20,RL,85.000000,10176,0,Reg,Lvl,Inside,Gtl,Gilbert,Norm,1Fam,1Story,4,6,1990,1990,Gable,CompShg,HdBoard,HdBoard,,0,3,TA,PConc,Gd,TA,Gd,GLQ,637,Unf,0,663,1300,GasA,4,Y,SBrkr,1341,0,0,1341,1,0,1,1,2,1,4,5,Typ,1,Po,Attchd,1990,Unf,2,506,TA,TA,Y,192,0,0,0,0,0,0,0,0,0,WD,Normal,2.TA,Gd.1,1.4,-2.250000,268,201002,1,2.5,1.007734,0
9,10,20,RL,70.000000,8400,0,Reg,Lvl,Corner,Gtl,NAmes,Norm,1Fam,1Story,7,6,1970,1970,Gable,CompShg,Plywood,Plywood,,0,3,TA,CBlock,TA,TA,No,ALQ,804,Rec,78,0,882,GasA,3,Y,SBrkr,882,0,0,882,1,0,1,0,2,1,3,4,Typ,0,0,Attchd,1970,Fin,2,525,TA,TA,Y,240,0,0,0,0,0,0,MnPrv,0,0,WD,Normal,2.TA,TA.1,1.3,9.972973,220,201004,2,2.0,0.745345,0


In [79]:
#houses_train.info()
#houses_test.info()

Delete first column

In [80]:
houses_train.drop('Unnamed: 0', axis=1, inplace=True, errors='raise')
houses_test.drop('Unnamed: 0', axis=1, inplace=True, errors='raise')

In [81]:
for c in houses_train.columns:
    if houses_train[c].dtype == 'object':
        lce = LabelCountEncoder()
        houses_train[c] = lce.fit_transform(houses_train[c])

In [82]:
for c in houses_test.columns:
    if houses_test[c].dtype == 'object':
        lce = LabelCountEncoder()
        houses_test[c] = lce.fit_transform(houses_test[c])

# Run xgboost

### Create private training & test set

In [83]:
from sklearn.model_selection import train_test_split

seed = 10
test_ratio = 0.2

X = houses_train.loc[:, houses_train.columns != "SalePrice"].values # convert to np.array
y = houses_train.loc[:, houses_train.columns == "SalePrice"].values

# Take log of SalePrice
y = np.log(y + 1).ravel() # convert to 1D array for model fit (xxx, )


In [84]:
X_pr_train, X_pr_test, y_pr_train, y_pr_test = train_test_split(X, y, test_size=test_ratio, random_state=seed)

In [85]:
print(len(X_pr_train), "train +", len(X_pr_test), "test")

1168 train + 292 test


### Fit Model

In [86]:
from xgboost import XGBRegressor

xgb_clf = XGBRegressor(max_depth=3, 
                        learning_rate=0.05, 
                        n_estimators=1000, # Number of boosted trees to fit
                        silent=False, # print messages while running 
                        objective='reg:linear', 
                        booster='gbtree', # Specify which booster to use: gbtree, gblinear or dart
                        #for dart see http://xgboost.readthedocs.io/en/latest/tutorials/dart.html 
                        n_jobs=-1, # Number of parallel threads used to run xgboost. (replaces nthread)
                        gamma=0,  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
                        min_child_weight=1, # Minimum sum of instance weight(hessian) needed in a child
                        max_delta_step=0, # Maximum delta step we allow each tree’s weight estimation to be
                        subsample=1, # Subsample ratio of the training instance
                        colsample_bytree=1, # Subsample ratio of columns when constructing each tree
                        colsample_bylevel=1, # Subsample ratio of columns for each split, in each level
                        reg_alpha=0, # L1 regularization term on weights
                        reg_lambda=1, # L2 regularization term on weights
                        scale_pos_weight=1, # Balancing of positive and negative weights
                        base_score=0.5, # The initial prediction score of all instances, global bias
                        random_state=743, 
                        missing=None) # Value in the data which needs to be present as a missing value. If None, defaults to np.nan



In [87]:
xgb_clf.fit(X_pr_train, y_pr_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=743,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

In [88]:
# make predictions for test data

y_pr_pred = xgb_clf.predict(X_pr_test)

Evaluate predictions

In [89]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_pr_test, y_pr_pred)
rmse = np.sqrt(mse)
rmse


0.13217551471760219

In [66]:
y_pr_test[1:5]
y_pr_pred[1:5]

array([ 12.1428719 ,  11.8277435 ,  12.01067193,  12.64109979])

array([ 12.20700264,  11.77560329,  11.87111568,  12.63988113], dtype=float32)

Save model to file

In [67]:
from sklearn.externals import joblib # More memory efficient than pickle for large numpy arrays

joblib.dump(xgb_clf, './Models/xgboost_model.pkl') 

['./Models/xgboost_model.pkl']

To load a model:

In [68]:
# xgb_clf_loaded = joblib.load('./Models/xgboost_model.pkl') 

### Model Tuning

In [69]:
from sklearn.model_selection import GridSearchCV

xgb_params = {'max_depth': 3,
              'learning_rate': 0.1, 
              'n_estimators': 100, 
              'objective': 'reg:linear'}

# param_grid = {'max_depth': [3], 
#               'learning_rate': [0.05], 
#               'n_estimators': [2000], # Number of boosted trees to fit
#               'objective': ['reg:linear'], 
#               'booster': ['gbtree'], # Specify which booster to use: gbtree, gblinear or dart
#               'n_jobs': [-1], # Number of parallel threads used to run xgboost. (replaces nthread)
#               'gamma': [0],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
#               'min_child_weight': [1], # Minimum sum of instance weight(hessian) needed in a child
#               'max_delta_step': [0], # Maximum delta step we allow each tree’s weight estimation to be
#               'subsample': [1], # Subsample ratio of the training instance
#               'colsample_bytree': [0.5], # Subsample ratio of columns when constructing each tree
#               'colsample_bylevel': [0.4], # Subsample ratio of columns for each split, in each level
#               'reg_alpha': [0], # L1 regularization term on weights
#               'reg_lambda': [1], # L2 regularization term on weights
#               'scale_pos_weight': [1], # Balancing of positive and negative weights
#               'base_score': [0.5], # The initial prediction score of all instances, global bias
#               'silent': [True],
#               'random_state': [10]}

param_grid = {'max_depth': [3, 5, 9], 
              'learning_rate': [0.03, 0.05, 0.07], 
              'n_estimators': [1700, 2000, 2500], # Number of boosted trees to fit
              'objective': ['reg:linear'], 
              'booster': ['gbtree'], # Specify which booster to use: gbtree, gblinear or dart
              'n_jobs': [-1], # Number of parallel threads used to run xgboost. (replaces nthread)
              'gamma': [0, 0.1, 0.5],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
              'min_child_weight': [1], # Minimum sum of instance weight(hessian) needed in a child
              'max_delta_step': [0], # Maximum delta step we allow each tree’s weight estimation to be
              'subsample': [1], # Subsample ratio of the training instance
              'colsample_bytree': [0.6, 0.8, 1], # Subsample ratio of columns when constructing each tree
              'colsample_bylevel': [0.3, 0.5, 0.7, 0.85, 1], # Subsample ratio of columns for each split, in each level
              'reg_alpha': [0], # L1 regularization term on weights
              'reg_lambda': [1], # L2 regularization term on weights
              'scale_pos_weight': [1], # Balancing of positive and negative weights
              'base_score': [0.5], # The initial prediction score of all instances, global bias
              'silent': [True],
              'random_state': [10]}

optimized_xgb_clf = GridSearchCV(XGBRegressor(**xgb_params), # scikit-learn estimator interface 
                                 param_grid = param_grid, # Dictionary with parameters names (string) as keys
                                 scoring="neg_mean_squared_error", # controls what metric they apply to the estimators evaluated
                                 n_jobs=-1, # If True, the data is assumed to be identically distributed across the folds,
                                 iid=True, 
                                 refit=True, # Refit an estimator using the best found parameters (best_estimator_)
                                 cv=3, #integer, to specify the number of folds in a (Stratified)KFold. None -> default 3-fold cross validation
                                 verbose=10, # the higher, the more messages
                                 pre_dispatch="2*n_jobs", # number of jobs that get dispatched during parallel execution
                                 error_score="raise", 
                                 return_train_score=False) #If False, the cv_results_ attribute will not include training scores

Inspect the grid

In [70]:
optimized_xgb_clf

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 5, 9], 'learning_rate': [0.03, 0.05, 0.07], 'n_estimators': [1700, 2000, 2500], 'objective': ['reg:linear'], 'booster': ['gbtree'], 'n_jobs': [-1], 'gamma': [0, 0.1, 0.5], 'min_child_weight': [1], 'max_delta_step': [0], 'subsample': [1], 'colsample_bytree': [0.6, 0.8, 1], 'colsample_bylevel': [0.3, 0.5, 0.7, 0.85, 1], 'reg_alpha': [0], 'reg_lambda': [1], 'scale_pos_weight': [1], 'base_score': [0.5], 'silent': [True], 'random_state': [10]},
       pre_dispatch='2*n_jobs', ref

Run grid tuning

In [71]:
optimized_xgb_clf.fit(X_pr_train, y_pr_train)

Fitting 3 folds for each of 1215 candidates, totalling 3645 fits
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1700, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1700, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1700, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, bo

KeyboardInterrupt: 

The best combination of parameters is:

In [None]:
optimized_xgb_clf.best_params_

In [None]:
## The best score is
optimized_xgb_clf.best_score_ # that's the training score so not meaningful

In [None]:
# optimized_xgb_clf.cv_results_

In [None]:
# make predictions for test data

y_pr_pred = optimized_xgb_clf.predict(X_pr_test)

Evaluate predictions

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_pr_test, y_pr_pred)
rmse = np.sqrt(mse)
rmse


In [None]:
y_pr_test[1:5]
y_pr_pred[1:5]