In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
houseprice = pd.read_csv('../03监督和非监督学习/data/house_prices/house_prices.csv')

x = houseprice[['OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea']]
y = houseprice.SalePrice

from sklearn import model_selection
train_x, test_x, train_y, test_y = model_selection.train_test_split(x, y, test_size = 0.3, random_state = 123456)

# Prepare data

In [3]:
dtrain = xgb.DMatrix(train_x, label = train_y)
dtest = xgb.DMatrix(test_x, label = test_y)

# Parameters

In [4]:
params = {
    'booster': 'gbtree',
    'objective': 'reg:linear', # Regression task
    'early_stopping_rounds': 50,
    'eval_metric': 'rmse', # Evaluation metrics also need to be changed
    'gamma': 0,
    'max_depth': 5,
    'subsample': 0.6,
    'colsample_bytree': 0.9,
    'min_child_weight': 1,
    'eta': 0.02,
    'seed': 123456,
    'nthread': 3,
    'silent': 0,
}

# Xgboost training model

In [5]:
watchlist = [(dtest, 'val'), (dtrain, 'train')]
model = xgb.train(params, dtrain, num_boost_round = 300, evals = watchlist, early_stopping_rounds = 10)

[0]	val-rmse:81447.81663	train-rmse:76831.29118
[1]	val-rmse:80314.22269	train-rmse:75746.86402
[2]	val-rmse:79180.18835	train-rmse:74603.65221
[3]	val-rmse:78156.31163	train-rmse:73593.35021
[4]	val-rmse:77054.01792	train-rmse:72475.42381
[5]	val-rmse:76189.02108	train-rmse:71438.10339
[6]	val-rmse:75358.51404	train-rmse:70467.78155
[7]	val-rmse:74510.54773	train-rmse:69484.02582
[8]	val-rmse:73634.76002	train-rmse:68500.19583
[9]	val-rmse:72703.02596	train-rmse:67521.38540
[10]	val-rmse:71775.08741	train-rmse:66528.97635
[11]	val-rmse:70856.07190	train-rmse:65552.43849
[12]	val-rmse:69915.56802	train-rmse:64617.60485
[13]	val-rmse:69063.94290	train-rmse:63716.40549
[14]	val-rmse:68137.43196	train-rmse:62753.71054
[15]	val-rmse:67324.07188	train-rmse:61914.05117
[16]	val-rmse:66488.33471	train-rmse:61078.04519
[17]	val-rmse:65716.61097	train-rmse:60299.26139
[18]	val-rmse:64943.48890	train-rmse:59464.66703
[19]	val-rmse:64074.80003	train-rmse:58598.47516
[20]	val-rmse:63407.30722	trai

  self.starting_round = model.num_boosted_rounds()
Parameters: { "early_stopping_rounds", "silent" } are not used.

  self.starting_round = model.num_boosted_rounds()


[151]	val-rmse:36474.93861	train-rmse:23307.31781
[152]	val-rmse:36462.03412	train-rmse:23255.76860
[153]	val-rmse:36446.61470	train-rmse:23215.78773
[154]	val-rmse:36387.85098	train-rmse:23154.03412
[155]	val-rmse:36343.10143	train-rmse:23107.90729
[156]	val-rmse:36345.00733	train-rmse:23064.66093
[157]	val-rmse:36287.61656	train-rmse:23017.07754
[158]	val-rmse:36322.41797	train-rmse:22968.99535
[159]	val-rmse:36337.26119	train-rmse:22933.05468
[160]	val-rmse:36379.76639	train-rmse:22879.51521
[161]	val-rmse:36436.16838	train-rmse:22821.05963
[162]	val-rmse:36431.81707	train-rmse:22780.36915
[163]	val-rmse:36478.72922	train-rmse:22742.07645
[164]	val-rmse:36419.23548	train-rmse:22681.98596
[165]	val-rmse:36358.03345	train-rmse:22627.17962
[166]	val-rmse:36358.07298	train-rmse:22570.01788
[167]	val-rmse:36315.52835	train-rmse:22535.82486
[168]	val-rmse:36308.53598	train-rmse:22481.12137
[169]	val-rmse:36281.49889	train-rmse:22423.66985
[170]	val-rmse:36287.51642	train-rmse:22377.10423


# Prediction

In [6]:
predict_y = model.predict(dtest, iteration_range = (0, model.best_iteration))
predict_y[:50]

array([186782.62 , 158308.03 , 189887.53 , 137593.61 , 168157.38 ,
       127193.25 , 115729.   , 129018.44 , 153099.4  , 115903.75 ,
       116238.734, 149188.84 , 103349.73 , 191476.7  , 191140.27 ,
       117256.85 , 167023.05 , 218125.36 , 161182.69 , 107676.13 ,
       124426.78 , 103894.086, 143802.73 , 140620.5  , 129017.234,
       300032.38 , 180057.8  , 178509.95 , 118451.516, 210680.97 ,
       177971.89 , 194407.17 , 172823.47 , 173725.98 , 296378.03 ,
       377237.5  , 168470.   , 196021.16 , 162195.19 , 182415.22 ,
       306426.12 , 187732.02 ,  87891.78 , 138883.36 , 175069.78 ,
       333156.88 , 134437.94 , 120888.   , 153827.42 , 162263.11 ],
      dtype=float32)

# Evaluating model performance: R-squared coefficient of determination

In [7]:
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# 解释方差
explained_variance_score(test_y, predict_y)

# 绝对平均误差
mean_absolute_error(test_y, predict_y)

# 均方误差
mean_squared_error(test_y, predict_y)

# 决定系数R2
r2_score(test_y, predict_y)

0.8159574270248413

21178.849609375

1256134016.0

0.8157873153686523