# Setup

Make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline

In [836]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # for retina screens
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

Configure notebook to display all results in cell

In [837]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Revert to the last line of output only
# InteractiveShell.ast_node_interactivity = "last_expr"

# Load Data

In [838]:
import pandas as pd

houses_train = pd.read_csv('../Data/encoded_houses_train.csv')
houses_test = pd.read_csv('../Data/encoded_houses_test.csv')

In [839]:
print("houses_train dimensions: {}".format(houses_train.shape))
print("houses_test dimensions: {}".format(houses_test.shape))

houses_train dimensions: (1460, 288)
houses_test dimensions: (1459, 288)


In [840]:
pd.set_option("display.max_columns", 400)
houses_train.head(3)

Unnamed: 0,MSSubClass_lev_x.120,MSSubClass_lev_x.160,MSSubClass_lev_x.190,MSSubClass_lev_x.30,MSSubClass_lev_x.50,MSSubClass_lev_x.60,MSSubClass_lev_x.90,MSSubClass_catP,MSSubClass_catN,MSSubClass_catD,MSZoning_lev_x.FV,MSZoning_lev_x.RL,MSZoning_lev_x.RM,MSZoning_catP,MSZoning_catN,MSZoning_catD,LotFrontage_clean,LotArea_clean,Alley_lev_x.Grvl,Alley_lev_x.NA,Alley_catP,Alley_catN,Alley_catD,LotShape_lev_x.IR1,LotShape_lev_x.IR2,LotShape_lev_x.Reg,LotShape_catP,LotShape_catN,LotShape_catD,LandContour_lev_x.Bnk,LandContour_lev_x.HLS,LandContour_catN,LandContour_catD,LotConfig_lev_x.CulDSac,LotConfig_lev_x.Inside,LotConfig_catP,LotConfig_catN,LandSlope_catP,Neighborhood_lev_x.BrkSide,Neighborhood_lev_x.CollgCr,Neighborhood_lev_x.Crawfor,Neighborhood_lev_x.Edwards,Neighborhood_lev_x.IDOTRR,Neighborhood_lev_x.Mitchel,Neighborhood_lev_x.NAmes,Neighborhood_lev_x.NoRidge,Neighborhood_lev_x.NridgHt,Neighborhood_lev_x.OldTown,Neighborhood_lev_x.Sawyer,Neighborhood_lev_x.Somerst,Neighborhood_lev_x.Timber,Neighborhood_catP,Neighborhood_catN,Neighborhood_catD,Condition1_lev_x.Artery,Condition1_lev_x.Feedr,Condition1_lev_x.Norm,Condition1_catP,Condition1_catN,Condition1_catD,BldgType_lev_x.1Fam,BldgType_lev_x.2fmCon,BldgType_lev_x.Duplex,BldgType_lev_x.Twnhs,BldgType_catP,BldgType_catN,BldgType_catD,HouseStyle_lev_x.1.5Fin,HouseStyle_lev_x.1Story,HouseStyle_lev_x.2Story,HouseStyle_lev_x.SFoyer,HouseStyle_catP,HouseStyle_catN,HouseStyle_catD,OverallQual_clean,OverallCond_clean,YearBuilt_clean,YearRemodAdd_clean,RoofStyle_lev_x.Gable,RoofStyle_lev_x.Hip,RoofStyle_catP,RoofStyle_catN,RoofStyle_catD,RoofMatl_lev_x.CompShg,RoofMatl_catP,RoofMatl_catN,Exterior1st_lev_x.Cement,Exterior1st_lev_x.HdBoard,Exterior1st_lev_x.MetalSd,Exterior1st_lev_x.Shingles,Exterior1st_lev_x.VinylSd,Exterior1st_lev_x.Wd.Sdng,Exterior1st_catP,Exterior1st_catN,Exterior1st_catD,Exterior2nd_lev_x.Cement,Exterior2nd_lev_x.HdBoard,Exterior2nd_lev_x.MetalSd,Exterior2nd_lev_x.Plywood,Exterior2nd_lev_x.VinylSd,Exterior2nd_lev_x.Wd.Sdng,Exterior2nd_catP,Exterior2nd_catN,Exterior2nd_catD,MasVnrType_lev_x.BrkFace,MasVnrType_lev_x.None,MasVnrType_lev_x.Stone,MasVnrType_catP,MasVnrType_catN,MasVnrType_catD,MasVnrArea_clean,ExterQual_lev_x.3,ExterQual_lev_x.4,ExterQual_lev_x.5,ExterQual_catP,ExterQual_catN,ExterQual_catD,ExterCond_lev_x.TA,ExterCond_catP,ExterCond_catN,ExterCond_catD,Foundation_lev_x.BrkTil,Foundation_lev_x.CBlock,Foundation_lev_x.PConc,Foundation_catP,Foundation_catN,Foundation_catD,BsmtQual_lev_x.Ex,BsmtQual_lev_x.Fa,BsmtQual_lev_x.Gd,BsmtQual_lev_x.NA,BsmtQual_lev_x.TA,BsmtQual_catP,BsmtQual_catN,BsmtQual_catD,BsmtCond_lev_x.Fa,BsmtCond_lev_x.Gd,BsmtCond_lev_x.NA,BsmtCond_lev_x.TA,BsmtCond_catP,BsmtCond_catN,BsmtCond_catD,BsmtExposure_lev_x.Av,BsmtExposure_lev_x.Gd,BsmtExposure_lev_x.NA,BsmtExposure_lev_x.No,BsmtExposure_catP,BsmtExposure_catN,BsmtExposure_catD,BsmtFinType1_lev_x.ALQ,BsmtFinType1_lev_x.BLQ,BsmtFinType1_lev_x.GLQ,BsmtFinType1_lev_x.LwQ,BsmtFinType1_lev_x.NA,BsmtFinType1_lev_x.Rec,BsmtFinType1_lev_x.Unf,BsmtFinType1_catP,BsmtFinType1_catN,BsmtFinType1_catD,BsmtFinSF1_clean,BsmtFinType2_lev_x.BLQ,BsmtFinType2_lev_x.NA,BsmtFinType2_lev_x.Unf,BsmtFinType2_catP,BsmtFinType2_catN,BsmtFinType2_catD,BsmtUnfSF_clean,TotalBsmtSF_clean,Heating_lev_x.GasA,Heating_catP,Heating_catN,Heating_catD,HeatingQC_lev_x.2,HeatingQC_lev_x.3,HeatingQC_lev_x.4,HeatingQC_lev_x.5,HeatingQC_catP,HeatingQC_catN,HeatingQC_catD,CentralAir_lev_x.N,CentralAir_lev_x.Y,Electrical_lev_x.FuseA,Electrical_lev_x.FuseFP,Electrical_lev_x.SBrkr,Electrical_catP,Electrical_catN,Electrical_catD,X1stFlrSF_clean,X2ndFlrSF_clean,GrLivArea_clean,BsmtFullBath_clean,FullBath_clean,HalfBath_clean,BedroomAbvGr_clean,KitchenAbvGr_clean,KitchenQual_lev_x.2,KitchenQual_lev_x.3,KitchenQual_lev_x.4,KitchenQual_lev_x.5,KitchenQual_catP,KitchenQual_catN,KitchenQual_catD,TotRmsAbvGrd_clean,Functional_lev_x.Min1,Functional_lev_x.Min2,Functional_lev_x.Typ,Functional_catP,Functional_catN,Fireplaces_clean,FireplaceQu_lev_x.Gd,FireplaceQu_lev_x.NA,FireplaceQu_lev_x.TA,FireplaceQu_catP,FireplaceQu_catN,FireplaceQu_catD,GarageType_lev_x.Attchd,GarageType_lev_x.BuiltIn,GarageType_lev_x.Detchd,GarageType_lev_x.NA,GarageType_catP,GarageType_catN,GarageType_catD,GarageYrBlt_clean,GarageFinish_lev_x.Fin,GarageFinish_lev_x.NA,GarageFinish_lev_x.RFn,GarageFinish_lev_x.Unf,GarageFinish_catP,GarageFinish_catN,GarageFinish_catD,GarageCars_clean,GarageArea_clean,GarageQual_lev_x.FaPo,GarageQual_lev_x.NA,GarageQual_lev_x.TA,GarageQual_catP,GarageQual_catN,GarageQual_catD,GarageCond_lev_x.FaPo,GarageCond_lev_x.NA,GarageCond_lev_x.TA,GarageCond_catP,GarageCond_catN,GarageCond_catD,PavedDrive_lev_x.N,PavedDrive_lev_x.P,PavedDrive_lev_x.Y,PavedDrive_catP,PavedDrive_catN,PavedDrive_catD,WoodDeckSF_clean,OpenPorchSF_clean,EnclosedPorch_clean,ScreenPorch_clean,PoolArea_clean,PoolQC_lev_x.NA,Fence_lev_x.GdWo,Fence_lev_x.MnPrv,Fence_lev_x.NA,Fence_catP,Fence_catN,Fence_catD,MiscFeature_lev_x.NA,MiscFeature_lev_x.Shed,SaleType_lev_x.COD,SaleType_lev_x.New,SaleType_lev_x.WD,SaleType_catP,SaleType_catN,SaleType_catD,SaleCondition_lev_x.Abnorml,SaleCondition_lev_x.Normal,SaleCondition_lev_x.Partial,SaleCondition_catP,SaleCondition_catN,SaleCondition_catD,Garage.interaction_lev_x.0.NA,Garage.interaction_lev_x.1.FaPo,Garage.interaction_lev_x.1.TA,Garage.interaction_lev_x.3.TA,Garage.interaction_catN,Garage.interaction_catD,new.old_clean,Room.size_clean,TotalBath_clean,BathToBed_clean,AvgHouseLivArea.ratio_clean,SalePrice
0,-1258.292759,2486.995169,1076.388725,4220.928481,4819.768834,56614.048554,1749.833939,934.665034,56614.048554,34358.552713,-1541.961482,10083.798897,9584.341147,9547.439806,10083.798897,10654.166247,-6327.176048,-4340.281002,2081.635315,2530.935592,2525.780092,2530.935592,2587.530377,-12487.036054,-1702.18391,-16166.377512,-15660.656008,-16166.377512,-16066.433479,1705.424725,-1794.77816,-737.449132,524.941057,-2954.42283,-3983.148362,-4364.093518,-3877.639435,-966.528866,2320.29555,17044.577443,-1075.1455,3875.109992,2100.846274,856.061374,6390.021114,-4460.422106,-7535.723728,4420.665283,2356.043648,-2782.709719,-1638.816847,-12260.754646,17844.445823,-6926.37525,1557.944336,2258.232681,3574.296173,3515.263457,3744.534362,3385.879452,4842.611487,1138.66835,1749.833939,1365.851393,4985.550538,4923.820824,4456.649486,5272.974388,4881.922638,29130.568155,1192.078881,-1761.55171,29167.216338,16580.913802,40923.342747,3197.819797,43643.614655,35385.344572,-9437.239712,-9246.45739,-9018.959469,-9497.234142,-9437.239712,-1117.516671,-1116.650229,-1083.73243,-2160.955547,3199.766953,5588.535561,1655.02704,32811.70508,5105.572052,25419.741384,32205.658287,2025.615057,-2054.63692,2190.537549,5344.507962,1380.008207,33718.328862,5074.758979,27222.653463,33677.680038,9080.830661,23770.67602,35536.895946,-8135.728923,26471.884255,23770.67602,27515.768886,19295.767973,59821.973785,50712.314355,-6885.559527,47646.354997,50712.314355,50349.039084,3113.700365,3227.789237,3113.700365,2642.97741,5403.346728,23882.827112,44309.24615,8144.454889,44257.582638,42782.104788,-13204.257877,1602.119899,21767.283074,1957.081692,32139.006329,-7413.626966,21767.283074,8744.627019,2048.030578,-1522.663991,1957.081692,2711.42501,2762.773722,2711.42501,4282.955527,-4588.064333,-7757.913839,1950.315361,-15268.899983,-14681.52957,-15268.899983,-16137.21027,3432.732335,3545.179872,54492.524205,1551.997472,1957.081692,3410.88851,4279.384692,19736.353379,54492.524205,44665.049892,17658.431224,689.605091,1885.96937,3773.494396,3761.968586,3774.078826,3441.936203,-16089.136948,-22380.746589,1099.999488,1101.240078,924.168422,908.434815,1979.506448,15991.241125,4757.194594,33993.233259,28916.076316,33993.233259,33908.651333,5265.514,5265.514,4041.057404,1639.132311,5889.441563,5901.154158,5889.441563,5906.37908,-38175.216474,29464.541341,20840.743798,19981.547773,35163.415006,27697.387673,2187.875556,2282.244622,2068.179901,41523.631696,31194.828,-10855.402508,5393.399003,31194.828,23249.592677,38664.722845,749.200191,874.571291,2507.951168,2506.221941,2379.675505,-35272.705133,-15984.707002,-39589.713282,-6768.193275,-32923.193386,-39899.668927,-34387.950989,21971.460431,-4735.486707,16890.23561,4558.31535,16961.429767,21971.460431,22174.306211,40645.175998,-18785.456721,4558.31535,21147.673778,27430.043876,4966.511788,21147.673778,1411.252803,15853.780274,17378.022268,2125.592612,4558.31535,6568.640113,6502.69881,6568.640113,6517.972311,1993.158835,4558.31535,6964.539404,6933.732785,6880.55073,6820.710461,4328.019438,1019.395718,5512.77799,5496.481178,5512.77799,5341.11103,-19378.59825,5430.725119,3669.086701,-2391.495786,-504.07755,-516.532435,1557.085048,3876.213933,6675.642108,6615.438697,6595.263972,5124.05458,1032.562437,1032.562437,1121.214836,-8573.210091,-7402.291878,-7086.641636,-7402.291878,-5966.677194,2790.432892,-5718.976358,-8555.295516,-5479.923422,-5830.38026,-8109.482244,4558.31535,1627.458604,14942.553004,-18081.252053,2844.401106,-1827.068929,1944.137004,-16230.334931,83113.381803,3902.056107,-8907.811262,208500.0
1,-1258.292759,2486.995169,1076.388725,4220.928481,4819.768834,-15575.043925,1749.833939,19991.111383,4088.061517,19982.313025,-1541.961482,10083.798897,9584.341147,9547.439806,10083.798897,10654.166247,8977.032177,-1925.313257,2081.635315,2530.935592,2525.780092,2530.935592,2587.530377,-12487.036054,-1702.18391,-16166.377512,-15660.656008,-16166.377512,-16066.433479,1705.424725,-1794.77816,-737.449132,524.941057,-2954.42283,10270.274698,13931.27412,105.851916,-966.528866,2320.29555,-1951.669173,-1075.1455,3875.109992,2100.846274,856.061374,6390.021114,-4460.422106,-7535.723728,4420.665283,2356.043648,-2782.709719,-1638.816847,21675.382122,58658.941416,19168.389839,1557.944336,-38445.714409,-22518.06589,-21563.748126,-38281.570851,-34434.226,4842.611487,1138.66835,1749.833939,1365.851393,4985.550538,4923.820824,4456.649486,5272.974388,-4935.717929,-12771.529881,1192.078881,4833.577004,-4899.156999,3141.069794,-4512.459847,-13476.526287,6508.531021,-17299.739392,-9437.239712,-9246.45739,-9018.959469,-9497.234142,-9437.239712,-1117.516671,-1116.650229,-1083.73243,-2160.955547,3199.766953,-31499.018618,1655.02704,-17881.511234,5105.572052,-8241.802844,-32132.547786,-18733.95572,-2054.63692,2190.537549,-31118.022993,1380.008207,-17830.111074,5074.758979,-8222.910256,-31158.793154,-18270.195564,-10421.626432,-23962.952771,-8135.728923,-24257.254871,-23962.952771,-24179.751834,-21421.861116,-36579.882425,-25460.503504,-6885.559527,-35575.217778,-36579.882425,-37222.834821,3113.700365,3227.789237,3113.700365,2642.97741,5403.346728,-31115.481379,-35262.093799,6630.164869,-31167.269831,-36391.321729,-13204.257877,1602.119899,21767.283074,1957.081692,32139.006329,-7413.626966,21767.283074,8744.627019,2048.030578,-1522.663991,1957.081692,2711.42501,2762.773722,2711.42501,4282.955527,-4588.064333,76768.61008,1950.315361,28700.71338,28995.522305,76768.61008,76599.173752,-19348.127709,3545.179872,-21859.764988,1551.997472,1957.081692,3410.88851,4279.384692,-16879.34593,-19348.127709,-30286.519967,35965.674238,689.605091,1885.96937,3773.494396,3761.968586,3774.078826,3441.936203,-10921.985602,22729.752508,1099.999488,1101.240078,924.168422,908.434815,1979.506448,15991.241125,4757.194594,33993.233259,28916.076316,33993.233259,33908.651333,5265.514,5265.514,4041.057404,1639.132311,5889.441563,5901.154158,5889.441563,5906.37908,12372.036147,-20165.329234,-27153.657019,-14789.67958,35163.415006,-17184.06183,2187.875556,2282.244622,2068.179901,-40958.684326,-20915.52541,-10855.402508,-24248.357835,-40958.684326,-38217.225028,-13507.63885,749.200191,874.571291,2507.951168,2506.221941,2379.675505,22267.126704,-15984.707002,35476.496317,24802.292927,30247.709068,24495.637857,10089.585002,21971.460431,-4735.486707,16890.23561,4558.31535,16961.429767,21971.460431,22174.306211,-778.667375,-18785.456721,4558.31535,21147.673778,27430.043876,4966.511788,21147.673778,1411.252803,15853.780274,-3006.791809,2125.592612,4558.31535,6568.640113,6502.69881,6568.640113,6517.972311,1993.158835,4558.31535,6964.539404,6933.732785,6880.55073,6820.710461,4328.019438,1019.395718,5512.77799,5496.481178,5512.77799,5341.11103,41896.288024,-17671.127149,3669.086701,-2391.495786,-504.07755,-516.532435,1557.085048,3876.213933,6675.642108,6615.438697,6595.263972,5124.05458,1032.562437,1032.562437,1121.214836,-8573.210091,-7402.291878,-7086.641636,-7402.291878,-5966.677194,2790.432892,-5718.976358,-8555.295516,-5479.923422,-5830.38026,-8109.482244,4558.31535,1627.458604,14942.553004,-18081.252053,2844.401106,-1827.068929,-1972.055257,-20070.589205,19314.466223,-6015.433778,-32589.839388,181500.0
2,-1258.292759,2486.995169,1076.388725,4220.928481,4819.768834,56614.048554,1749.833939,934.665034,56614.048554,34358.552713,-1541.961482,10083.798897,9584.341147,9547.439806,10083.798897,10654.166247,-3266.334403,1539.640463,2081.635315,2530.935592,2525.780092,2530.935592,2587.530377,25180.469399,-1702.18391,27951.213455,23295.254551,25180.469399,25235.762331,1705.424725,-1794.77816,-737.449132,524.941057,-2954.42283,-3983.148362,-4364.093518,-3877.639435,-966.528866,2320.29555,17044.577443,-1075.1455,3875.109992,2100.846274,856.061374,6390.021114,-4460.422106,-7535.723728,4420.665283,2356.043648,-2782.709719,-1638.816847,-12260.754646,17844.445823,-6926.37525,1557.944336,2258.232681,3574.296173,3515.263457,3744.534362,3385.879452,4842.611487,1138.66835,1749.833939,1365.851393,4985.550538,4923.820824,4456.649486,5272.974388,4881.922638,29130.568155,1192.078881,-1761.55171,29167.216338,16580.913802,40923.342747,3197.819797,40892.867719,33434.045166,-9437.239712,-9246.45739,-9018.959469,-9497.234142,-9437.239712,-1117.516671,-1116.650229,-1083.73243,-2160.955547,3199.766953,5588.535561,1655.02704,32811.70508,5105.572052,25419.741384,32205.658287,2025.615057,-2054.63692,2190.537549,5344.507962,1380.008207,33718.328862,5074.758979,27222.653463,33677.680038,9080.830661,23770.67602,35536.895946,-8135.728923,26471.884255,23770.67602,27515.768886,12232.505784,59821.973785,50712.314355,-6885.559527,47646.354997,50712.314355,50349.039084,3113.700365,3227.789237,3113.700365,2642.97741,5403.346728,23882.827112,44309.24615,8144.454889,44257.582638,42782.104788,-13204.257877,1602.119899,21767.283074,1957.081692,32139.006329,-7413.626966,21767.283074,8744.627019,2048.030578,-1522.663991,1957.081692,2711.42501,2762.773722,2711.42501,4282.955527,-4588.064333,-7757.913839,1950.315361,28700.71338,30062.116978,11868.462004,10363.934213,3432.732335,3545.179872,54492.524205,1551.997472,1957.081692,3410.88851,4279.384692,19736.353379,54492.524205,44665.049892,2851.102315,689.605091,1885.96937,3773.494396,3761.968586,3774.078826,3441.936203,-5137.860961,-15269.731953,1099.999488,1101.240078,924.168422,908.434815,1979.506448,15991.241125,4757.194594,33993.233259,28916.076316,33993.233259,33908.651333,5265.514,5265.514,4041.057404,1639.132311,5889.441563,5901.154158,5889.441563,5906.37908,-30207.176652,30161.916572,28982.651079,19981.547773,35163.415006,27697.387673,2187.875556,2282.244622,2068.179901,41523.631696,31194.828,-10855.402508,5393.399003,31194.828,23249.592677,-13507.63885,749.200191,874.571291,2507.951168,2506.221941,2379.675505,22267.126704,-15984.707002,35476.496317,24802.292927,30247.709068,24495.637857,10089.585002,21971.460431,-4735.486707,16890.23561,4558.31535,16961.429767,21971.460431,22174.306211,37576.743155,-18785.456721,4558.31535,21147.673778,27430.043876,4966.511788,21147.673778,1411.252803,15853.780274,31276.759139,2125.592612,4558.31535,6568.640113,6502.69881,6568.640113,6517.972311,1993.158835,4558.31535,6964.539404,6933.732785,6880.55073,6820.710461,4328.019438,1019.395718,5512.77799,5496.481178,5512.77799,5341.11103,-19378.59825,-1764.933784,3669.086701,-2391.495786,-504.07755,-516.532435,1557.085048,3876.213933,6675.642108,6615.438697,6595.263972,5124.05458,1032.562437,1032.562437,1121.214836,-8573.210091,-7402.291878,-7086.641636,-7402.291878,-5966.677194,2790.432892,-5718.976358,-8555.295516,-5479.923422,-5830.38026,-8109.482244,4558.31535,1627.458604,14942.553004,-18081.252053,2844.401106,-1827.068929,294.032347,64415.004827,83113.381803,3902.056107,-3725.865408,223500.0


In [841]:
houses_train.info()
houses_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 288 entries, MSSubClass_lev_x.120 to SalePrice
dtypes: float64(288)
memory usage: 3.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 288 entries, MSSubClass_lev_x.120 to SalePrice
dtypes: float64(287), int64(1)
memory usage: 3.2 MB


Delete first column

In [842]:
# houses_train.drop('Unnamed: 0', axis=1, inplace=True, errors='raise')
# houses_test.drop('Unnamed: 0', axis=1, inplace=True, errors='raise')

# Run xgboost

### Create private training & test set

In [843]:
from sklearn.model_selection import train_test_split

seed = 10
test_ratio = 0.2

X = houses_train.loc[:, houses_train.columns != "SalePrice"].values # convert to np.array
y = houses_train.loc[:, houses_train.columns == "SalePrice"].values

# Take log of SalePrice
y = np.log(y + 1).ravel() # convert to 1D array for model fit (xxx, )


In [844]:
X_pr_train, X_pr_test, y_pr_train, y_pr_test = train_test_split(X, y, test_size=test_ratio, random_state=seed)

In [845]:
print(len(X_pr_train), "train +", len(X_pr_test), "test")

1168 train + 292 test


### Fit Model

In [846]:
from xgboost import XGBRegressor

xgb_clf = XGBRegressor(max_depth=3, 
                        learning_rate=0.05, 
                        n_estimators=1000, # Number of boosted trees to fit
                        silent=False, # print messages while running 
                        objective='reg:linear', 
                        booster='gbtree', # Specify which booster to use: gbtree, gblinear or dart
                        #for dart see http://xgboost.readthedocs.io/en/latest/tutorials/dart.html 
                        n_jobs=-1, # Number of parallel threads used to run xgboost. (replaces nthread)
                        gamma=0,  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
                        min_child_weight=1, # Minimum sum of instance weight(hessian) needed in a child
                        max_delta_step=0, # Maximum delta step we allow each tree’s weight estimation to be
                        subsample=1, # Subsample ratio of the training instance
                        colsample_bytree=1, # Subsample ratio of columns when constructing each tree
                        colsample_bylevel=1, # Subsample ratio of columns for each split, in each level
                        reg_alpha=0, # L1 regularization term on weights
                        reg_lambda=1, # L2 regularization term on weights
                        scale_pos_weight=1, # Balancing of positive and negative weights
                        base_score=0.5, # The initial prediction score of all instances, global bias
                        random_state=743, 
                        missing=None) # Value in the data which needs to be present as a missing value. If None, defaults to np.nan



In [847]:
xgb_clf.fit(X_pr_train, y_pr_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=-1, nthread=None, objective='reg:linear', random_state=743,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

In [848]:
# make predictions for test data

y_pr_pred = xgb_clf.predict(X_pr_test)

Evaluate predictions

In [849]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_pr_test, y_pr_pred)
rmse = np.sqrt(mse)
rmse


0.12373572496003182

In [850]:
y_pr_test[1:5]
y_pr_pred[1:5]

array([ 12.1428719 ,  11.8277435 ,  12.01067193,  12.64109979])

array([ 12.22154427,  11.80571461,  11.84539032,  12.69584465], dtype=float32)

Save model to file

In [851]:
from sklearn.externals import joblib # More memory efficient than pickle for large numpy arrays

joblib.dump(xgb_clf, './Models/xgboost_model.pkl') 

['./Models/xgboost_model.pkl']

To load a model:

In [852]:
# xgb_clf_loaded = joblib.load('./Models/xgboost_model.pkl') 

### Model Tuning

In [853]:
from sklearn.model_selection import GridSearchCV

xgb_params = {'max_depth': 3,
              'learning_rate': 0.1, 
              'n_estimators': 100, 
              'objective': 'reg:linear'}

# param_grid = {'max_depth': [3], 
#               'learning_rate': [0.05], 
#               'n_estimators': [2000], # Number of boosted trees to fit
#               'objective': ['reg:linear'], 
#               'booster': ['gbtree'], # Specify which booster to use: gbtree, gblinear or dart
#               'n_jobs': [-1], # Number of parallel threads used to run xgboost. (replaces nthread)
#               'gamma': [0],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
#               'min_child_weight': [1], # Minimum sum of instance weight(hessian) needed in a child
#               'max_delta_step': [0], # Maximum delta step we allow each tree’s weight estimation to be
#               'subsample': [1], # Subsample ratio of the training instance
#               'colsample_bytree': [0.5], # Subsample ratio of columns when constructing each tree
#               'colsample_bylevel': [0.4], # Subsample ratio of columns for each split, in each level
#               'reg_alpha': [0], # L1 regularization term on weights
#               'reg_lambda': [1], # L2 regularization term on weights
#               'scale_pos_weight': [1], # Balancing of positive and negative weights
#               'base_score': [0.5], # The initial prediction score of all instances, global bias
#               'silent': [True],
#               'random_state': [10]}

param_grid = {'max_depth': [3], 
              'learning_rate': [0.05, 0.07, 0.09], 
              'n_estimators': [1500, 1800, 2000], # Number of boosted trees to fit
              'objective': ['reg:linear'], 
              'booster': ['gbtree'], # Specify which booster to use: gbtree, gblinear or dart
              'n_jobs': [-1], # Number of parallel threads used to run xgboost. (replaces nthread)
              'gamma': [0, 0.5],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
              'min_child_weight': [1], # Minimum sum of instance weight(hessian) needed in a child
              'max_delta_step': [0], # Maximum delta step we allow each tree’s weight estimation to be
              'subsample': [1], # Subsample ratio of the training instance
              'colsample_bytree': [0.5, 0.6, 0.7], # Subsample ratio of columns when constructing each tree
              'colsample_bylevel': [0.2, 0.3, 0.4], # Subsample ratio of columns for each split, in each level
              'reg_alpha': [0], # L1 regularization term on weights
              'reg_lambda': [1], # L2 regularization term on weights
              'scale_pos_weight': [1], # Balancing of positive and negative weights
              'base_score': [0.5], # The initial prediction score of all instances, global bias
              'silent': [True],
              'random_state': [10]}

optimized_xgb_clf = GridSearchCV(XGBRegressor(**xgb_params), # scikit-learn estimator interface 
                                 param_grid = param_grid, # Dictionary with parameters names (string) as keys
                                 scoring="neg_mean_squared_error", # controls what metric they apply to the estimators evaluated
                                 n_jobs=-1, # If True, the data is assumed to be identically distributed across the folds,
                                 iid=True, 
                                 refit=True, # Refit an estimator using the best found parameters (best_estimator_)
                                 cv=3, #integer, to specify the number of folds in a (Stratified)KFold. None -> default 3-fold cross validation
                                 verbose=10, # the higher, the more messages
                                 pre_dispatch="2*n_jobs", # number of jobs that get dispatched during parallel execution
                                 error_score="raise", 
                                 return_train_score=False) #If False, the cv_results_ attribute will not include training scores

Inspect the grid

In [None]:
optimized_xgb_clf

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3], 'learning_rate': [0.05, 0.07, 0.09], 'n_estimators': [1500, 1800, 2000], 'objective': ['reg:linear'], 'booster': ['gbtree'], 'n_jobs': [-1], 'gamma': [0, 0.5], 'min_child_weight': [1], 'max_delta_step': [0], 'subsample': [1], 'colsample_bytree': [0.5, 0.6, 0.7], 'colsample_bylevel': [0.2, 0.3, 0.4], 'reg_alpha': [0], 'reg_lambda': [1], 'scale_pos_weight': [1], 'base_score': [0.5], 'silent': [True], 'random_state': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_tr

Run grid tuning

In [None]:
optimized_xgb_clf.fit(X_pr_train, y_pr_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV] base_score=0.5, boos

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    7.9s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015882879480013438, total=   4.6s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.01426418179182491, total=   4.5s
[

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.1s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015397253074609015, total=   4.7s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014896910615445118, total=   3.5s


[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   20.5s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014487917429806195, total=   5.0s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014882573504946824, total=   4.9s


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   25.5s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.02612714322820539, total=   3.7s
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.016601781227944486, total=   5.1s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   37.3s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.02612714322820539, total=   4.8s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.02075288379834782, total=   4

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.4s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.023272999934123635, total=   4.7s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.021072572662089138, total=  

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   58.0s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.5, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.01987726869683416, total=   4.8s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015968985168971123, total=   4.0s

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.2min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014346245178755703, total=   4.3s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014673124230664615, total=   4.4s


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.4min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015125185992708332, total=   4.8s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.0169127740362413, total=   5.2s


[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 


[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.7min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.02507966059810059, total=   4.0s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020988968856872178, total=   

[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020189125563916617, total=   5.6s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, 

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.0min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.02603826327905968, total=   5.5s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.6, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020189125563916617, total=   5.

[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013752267206574515, total=   6.1s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014344940450357663, total=   6.5s


[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.4min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.016369492049837786, total=   7.7s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013777016883146849, total=   7.3s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014650454728789055, total=   7.7s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015600611415723933, total=   7.7

[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.8min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.019779085786995873, total=   4.9s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.024816630466459585, total=  

[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.025384513686572227, total=   6.7s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, 

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.2min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020649016330320024, total=   4.6s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.2, colsample_bytree=0.7, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.01997056853198773, total=   

[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013497981762664436, total=   7.7s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.01538927451485356, total=   8.3s
[

[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3.8min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014146340857155916, total=   6.5s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015523815002504862, total=   6.9s


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.015612516617147743, total=   7.2s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013912039448150141, total=   7.4

[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.3min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.025257406062620932, total=   7.5s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.02091060848861221, total=   

[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020544278738009736, total=   6.8s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, 

[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:  5.0min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.025314913072055453, total=   6.9s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.5, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020770288041086566, total=   6

[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.014335971563446882, total=   5.9s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013991828521211653, total=   5.8s


[Parallel(n_jobs=-1)]: Done 234 tasks      | elapsed:  5.6min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.01607022490137085, total=   5.9s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.013486314897466848, total=   5.9s
[

[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020768675540582395, total=   7.0s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.019890639713141055, total=  

[Parallel(n_jobs=-1)]: Done 257 tasks      | elapsed:  6.2min


[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1800, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.020099730604727317, total=   7.2s
[CV] base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0.5, learning_rate=0.09, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=1500, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1 
[CV]  base_score=0.5, booster=gbtree, colsample_bylevel=0.3, colsample_bytree=0.6, gamma=0.5, learning_rate=0.07, max_delta_step=0, max_depth=3, min_child_weight=1, n_estimators=2000, n_jobs=-1, objective=reg:linear, random_state=10, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, silent=True, subsample=1, score=-0.026082645420525526, total=  

The best combination of parameters is:

In [None]:
optimized_xgb_clf.best_params_

In [None]:
## The best score is
optimized_xgb_clf.best_score_ # that's the training score so not meaningful

In [None]:
# optimized_xgb_clf.cv_results_

In [None]:
# make predictions for test data

y_pr_pred = optimized_xgb_clf.predict(X_pr_test)

Evaluate predictions

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_pr_test, y_pr_pred)
rmse = np.sqrt(mse)
rmse


In [None]:
y_pr_test[1:5]
y_pr_pred[1:5]