# Zillow’s Home Value Prediction
Competition page: https://www.kaggle.com/c/zillow-prize-1/overview

Zillow provides a “Zestimate”, which is an estimated property value.

The competition target is the log error of Zillow's home value prediction:
                       
                       logerror = log(Zestimate) - log(SalePrice)


#### 1. Our task: predict the logerror of Zillow's Zestimate from the given house features

#### 2. technical goals: 

- dealing with the missing data
- defining an appropriate predicting model



##### Reference: 

* https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e

* https://xgboost.readthedocs.io/en/latest/python/python_api.html

* https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

In [129]:
# import required packages
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import xgboost as xgb
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
%matplotlib inline

In [210]:
# Read the three input CSVs: the property table, the 2016 training
# transactions, and the sample submission (its ParcelId column is reused
# when the submission file is built).
property_df, train_df, submission = map(
    pd.read_csv,
    (
        'input/properties_2016.csv',
        'input/train_2016_v2.csv',
        'input/sample_submission.csv',
    ),
)

In [211]:
# Report (rows, columns) of both raw tables.
for label, frame in (('Shape Train data', train_df), ('Shape Property', property_df)):
    print(label, frame.shape)

Shape Train data (90275, 3)
Shape Property (2985217, 58)


# Feature engineering

- Target Variable : remove outliers
- Missing data: remove columns with too many missing values
- Imputing missing values and LabelEncoder

In [212]:
# --- Target: trim the tails of logerror ---------------------------------
# Keep only rows whose logerror lies strictly between the 1st and 99th
# percentiles of the training target.
llimit = np.percentile(train_df.logerror.values, 1)
ulimit = np.percentile(train_df.logerror.values, 99)
inside_band = (train_df['logerror'] < ulimit) & (train_df['logerror'] > llimit)
train_df = train_df[inside_band]
print(train_df.shape)

# --- Missing data: per-column missing ratio ------------------------------
# Despite its name, the 'counts' column holds the *fraction* of missing
# values per column. Columns with no missing values are left out of the
# table, which is sorted by increasing ratio.
missing_df = (property_df.isnull().sum() / len(property_df)).reset_index()
missing_df.columns = ['column_name', 'counts']
missing_df = missing_df.loc[missing_df['counts'] > 0].sort_values(by='counts')

# Optional visual check of the missing ratios (left disabled):
# ind = np.arange(missing_df.shape[0])
# width = 0.9
# fig, ax = plt.subplots(figsize=(12,18))
# rects = ax.barh(ind, missing_df.counts, color='blue')
# ax.set_yticks(ind)
# ax.set_yticklabels(missing_df.column_name, rotation='horizontal')
# ax.set_xlabel("Count of missing values")
# ax.set_title("Number of missing values in each column")
# plt.show()

# Drop every column where more than 97% of the values are missing.
drop_index = missing_df.loc[missing_df['counts'] > 0.97, 'column_name']
property_df = property_df.drop(columns=drop_index)

# --- Imputation and label encoding --------------------------------------
# Every remaining missing value becomes the sentinel -1. Columns that are
# still of object dtype afterwards are replaced by integer codes from a
# fresh LabelEncoder fitted on that column.
for col in property_df.columns:
    filled = property_df[col].fillna(-1)
    if filled.dtype == 'object':
        filled = LabelEncoder().fit_transform(list(filled.values))
    property_df[col] = filled

(88465, 3)


In [216]:
# Attach the property features to each training transaction (left join on
# parcelid), then separate the target from the feature matrix.
df_train = pd.merge(train_df, property_df, how='left', on='parcelid')
train_y = df_train['logerror']
train_x = df_train.drop(columns=['parcelid', 'logerror', 'transactiondate'])

In [217]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.33,
    random_state=42,
)
(X_train.shape, X_test.shape)

((59271, 40), (29194, 40))

# Main method: XGBoost

In [106]:
# First round: baseline booster.
#
# Following the first reference, the learning rate is fixed at eta = 0.01 and
# up to 999 boosting rounds are allowed; early stopping then reveals how many
# rounds are actually useful at this learning rate.
# Prediction quality is measured with MAE (mean absolute error).

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters that are tuned later in the notebook.
tunable_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': 0.01,
    'subsample': 1,
    'colsample_bytree': 1,
}
# Parameters that stay fixed.
# NOTE(review): newer XGBoost releases call this objective
# 'reg:squarederror' and warn on 'reg:linear' — confirm against the
# installed version before renaming.
fixed_params = {
    'objective': 'reg:linear',
    'eval_metric': 'mae',
}
params = {**tunable_params, **fixed_params}

num_boost_round = 999

# Training stops once the MAE on `dtest` has not improved for 10 rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10,
)

  if getattr(data, 'base', None) is not None and \


[0]	Test-mae:0.485606
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:0.480746
[2]	Test-mae:0.475931
[3]	Test-mae:0.471162
[4]	Test-mae:0.466442
[5]	Test-mae:0.461775
[6]	Test-mae:0.457171
[7]	Test-mae:0.452591
[8]	Test-mae:0.448065
[9]	Test-mae:0.443589
[10]	Test-mae:0.439154
[11]	Test-mae:0.434774
[12]	Test-mae:0.430422
[13]	Test-mae:0.426138
[14]	Test-mae:0.421879
[15]	Test-mae:0.417662
[16]	Test-mae:0.413512
[17]	Test-mae:0.409388
[18]	Test-mae:0.405301
[19]	Test-mae:0.401261
[20]	Test-mae:0.397274
[21]	Test-mae:0.393319
[22]	Test-mae:0.389412
[23]	Test-mae:0.385543
[24]	Test-mae:0.381717
[25]	Test-mae:0.377928
[26]	Test-mae:0.37418
[27]	Test-mae:0.37047
[28]	Test-mae:0.3668
[29]	Test-mae:0.363161
[30]	Test-mae:0.359573
[31]	Test-mae:0.356018
[32]	Test-mae:0.352494
[33]	Test-mae:0.349012
[34]	Test-mae:0.345566
[35]	Test-mae:0.342152
[36]	Test-mae:0.338783
[37]	Test-mae:0.33544
[38]	Test-mae:0.33214
[39]	Test-mae:0.328873
[40]	Test-mae:0.325643
[41]	Test-mae:0.3

[341]	Test-mae:0.056468
[342]	Test-mae:0.056414
[343]	Test-mae:0.056362
[344]	Test-mae:0.056311
[345]	Test-mae:0.05626
[346]	Test-mae:0.056211
[347]	Test-mae:0.056162
[348]	Test-mae:0.056114
[349]	Test-mae:0.056067
[350]	Test-mae:0.056022
[351]	Test-mae:0.055975
[352]	Test-mae:0.05593
[353]	Test-mae:0.055886
[354]	Test-mae:0.055842
[355]	Test-mae:0.0558
[356]	Test-mae:0.055758
[357]	Test-mae:0.055716
[358]	Test-mae:0.055676
[359]	Test-mae:0.055636
[360]	Test-mae:0.055597
[361]	Test-mae:0.055559
[362]	Test-mae:0.055521
[363]	Test-mae:0.055483
[364]	Test-mae:0.055446
[365]	Test-mae:0.055411
[366]	Test-mae:0.055376
[367]	Test-mae:0.055341
[368]	Test-mae:0.055307
[369]	Test-mae:0.055273
[370]	Test-mae:0.055241
[371]	Test-mae:0.055208
[372]	Test-mae:0.055176
[373]	Test-mae:0.055145
[374]	Test-mae:0.055113
[375]	Test-mae:0.055083
[376]	Test-mae:0.055053
[377]	Test-mae:0.055024
[378]	Test-mae:0.054995
[379]	Test-mae:0.054967
[380]	Test-mae:0.054939
[381]	Test-mae:0.054912
[382]	Test-mae:0.054

[684]	Test-mae:0.053172
[685]	Test-mae:0.053172
[686]	Test-mae:0.053172
[687]	Test-mae:0.053172
[688]	Test-mae:0.053172
[689]	Test-mae:0.053172
[690]	Test-mae:0.053172
[691]	Test-mae:0.053172
[692]	Test-mae:0.053171
[693]	Test-mae:0.05317
[694]	Test-mae:0.05317
[695]	Test-mae:0.05317
[696]	Test-mae:0.05317
[697]	Test-mae:0.05317
[698]	Test-mae:0.053169
[699]	Test-mae:0.053169
[700]	Test-mae:0.053168
[701]	Test-mae:0.053168
[702]	Test-mae:0.053167
[703]	Test-mae:0.053167
[704]	Test-mae:0.053166
[705]	Test-mae:0.053166
[706]	Test-mae:0.053166
[707]	Test-mae:0.053166
[708]	Test-mae:0.053166
[709]	Test-mae:0.053165
[710]	Test-mae:0.053165
[711]	Test-mae:0.053165
[712]	Test-mae:0.053165
[713]	Test-mae:0.053165
[714]	Test-mae:0.053164
[715]	Test-mae:0.053164
[716]	Test-mae:0.053164
[717]	Test-mae:0.053164
[718]	Test-mae:0.053164
[719]	Test-mae:0.053164
[720]	Test-mae:0.053164
[721]	Test-mae:0.053164
[722]	Test-mae:0.053164
[723]	Test-mae:0.053164
[724]	Test-mae:0.053163
[725]	Test-mae:0.0531

In [108]:
# best_iteration is reported zero-based, so add 1 to get a round count.
best_mae = model.best_score
best_rounds = model.best_iteration + 1
print("Best MAE: {:.4f} with {} rounds".format(best_mae, best_rounds))

Best MAE: 0.0532 with 737 rounds


In [109]:
# 5-fold cross-validation on the training matrix using XGBoost's built-in
# cv helper, with the same parameters and round budget as the first round.
# The result holds, per boosting round, the mean and std of the train/test MAE.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10,
    seed=42,
)

cv_results




Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,0.485761,0.000170,0.485757,0.000686
1,0.480907,0.000169,0.480901,0.000689
2,0.476094,0.000166,0.476092,0.000691
3,0.471335,0.000168,0.471332,0.000696
4,0.466618,0.000165,0.466622,0.000697
5,0.461953,0.000161,0.461956,0.000699
6,0.457340,0.000164,0.457337,0.000705
7,0.452769,0.000156,0.452766,0.000705
8,0.448239,0.000158,0.448242,0.000708
9,0.443762,0.000158,0.443762,0.000713


In [110]:
# Lowest mean test-fold MAE over all boosting rounds.
cv_results.loc[:, 'test-mae-mean'].min()

0.052557200000000005

# Tuning
- Main method: RandomizedSearchCV

In [218]:
# 

# Randomized search over tree-shape and sampling parameters at a fixed
# learning rate of 0.01.
param_dist = {
        # Candidate tree counts around the round count found by early stopping above.
        'n_estimators':[737,738,739,740],
        'max_depth':range(3,12),
        'subsample':np.linspace(0.8,1,5),
        'colsample_bytree':np.linspace(0.5,0.98,10),
        'min_child_weight':range(1,9)
        }

# The estimator gets its own name on purpose: binding it to `xgb` would
# overwrite the `xgboost` module alias, and every later `xgb.train(...)` /
# `xgb.DMatrix(...)` call would then fail when the notebook is run top to bottom.
xgb_reg = XGBRegressor(learning_rate=0.01)

# 5 random parameter draws, each scored by 5-fold CV on negated MAE
# (scikit-learn maximises scores, hence the sign).
random_search = RandomizedSearchCV(xgb_reg, param_distributions=param_dist,
                                   n_iter=5, scoring='neg_mean_absolute_error',
                                   n_jobs=4, cv=5,
                                   verbose=3, random_state=100)

random_search.fit(X_train,y_train)


In [125]:
# Winning parameter draw, the refitted estimator, and its mean CV score (negated MAE).
(
    random_search.best_params_,
    random_search.best_estimator_,
    random_search.best_score_,
)

({'subsample': 1.0,
  'n_estimators': 740,
  'min_child_weight': 6,
  'max_depth': 6,
  'colsample_bytree': 0.9266666666666666},
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.9266666666666666, gamma=0,
        importance_type='gain', learning_rate=0.01, max_delta_step=0,
        max_depth=6, min_child_weight=6, missing=None, n_estimators=740,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=None, subsample=1.0, verbosity=1),
 -0.05254329764833132)

In [142]:
# Tune the learning rate on its own, keeping the tree and sampling
# parameters selected by the randomized search fixed.
param_eta = {'learning_rate': [.05, .01, .005]}

xgb2 = XGBRegressor(
    n_estimators=740,
    max_depth=6,
    min_child_weight=6,
    subsample=1,
    colsample_bytree=0.927,
)

# Exhaustive search over the three candidate rates, 5-fold CV, negated MAE.
gs = GridSearchCV(
    estimator=xgb2,
    param_grid=param_eta,
    scoring='neg_mean_absolute_error',
    cv=5,
)
gs.fit(X_train, y_train)

In [141]:
# Best learning rate, the refitted estimator, and its mean CV score (negated MAE).
(
    gs.best_params_,
    gs.best_estimator_,
    gs.best_score_,
)

({'learning_rate': 0.01},
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.927, gamma=0,
        importance_type='gain', learning_rate=0.01, max_delta_step=0,
        max_depth=6, min_child_weight=6, missing=None, n_estimators=740,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=None, subsample=1, verbosity=1),
 -0.05254329764833132)

In [149]:
#

# Final model: retrain with the tuned parameters for a fixed 740 rounds
# (no early stopping), logging the hold-out MAE after every round.
num_boost_round = 740

# Values selected by the searches above.
tuned_params = {
    'max_depth': 6,
    'min_child_weight': 6,
    'eta': 0.01,
    'subsample': 1,
    'colsample_bytree': 0.927,
}
# Objective and evaluation metric, unchanged from the first round.
fixed_params = {
    'objective': 'reg:linear',
    'eval_metric': 'mae',
}
params = {**tuned_params, **fixed_params}

best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
)



[0]	Test-mae:0.485608
[1]	Test-mae:0.480749
[2]	Test-mae:0.475934
[3]	Test-mae:0.471164
[4]	Test-mae:0.46645
[5]	Test-mae:0.461785
[6]	Test-mae:0.457167
[7]	Test-mae:0.452592
[8]	Test-mae:0.448082
[9]	Test-mae:0.443588
[10]	Test-mae:0.43917
[11]	Test-mae:0.434778
[12]	Test-mae:0.43044
[13]	Test-mae:0.426136
[14]	Test-mae:0.421902
[15]	Test-mae:0.417678
[16]	Test-mae:0.413517
[17]	Test-mae:0.409406
[18]	Test-mae:0.405323
[19]	Test-mae:0.401284
[20]	Test-mae:0.397284
[21]	Test-mae:0.393335
[22]	Test-mae:0.389425
[23]	Test-mae:0.385557
[24]	Test-mae:0.381729
[25]	Test-mae:0.377944
[26]	Test-mae:0.374193
[27]	Test-mae:0.370482
[28]	Test-mae:0.36681
[29]	Test-mae:0.363182
[30]	Test-mae:0.359589
[31]	Test-mae:0.356028
[32]	Test-mae:0.352509
[33]	Test-mae:0.349031
[34]	Test-mae:0.345578
[35]	Test-mae:0.342172
[36]	Test-mae:0.338798
[37]	Test-mae:0.33546
[38]	Test-mae:0.332156
[39]	Test-mae:0.328889
[40]	Test-mae:0.325655
[41]	Test-mae:0.322459
[42]	Test-mae:0.319296
[43]	Test-mae:0.316161
[44

[343]	Test-mae:0.056344
[344]	Test-mae:0.056292
[345]	Test-mae:0.056242
[346]	Test-mae:0.056192
[347]	Test-mae:0.056143
[348]	Test-mae:0.056095
[349]	Test-mae:0.056047
[350]	Test-mae:0.056001
[351]	Test-mae:0.055955
[352]	Test-mae:0.05591
[353]	Test-mae:0.055865
[354]	Test-mae:0.055822
[355]	Test-mae:0.055779
[356]	Test-mae:0.055737
[357]	Test-mae:0.055695
[358]	Test-mae:0.055654
[359]	Test-mae:0.055614
[360]	Test-mae:0.055575
[361]	Test-mae:0.055536
[362]	Test-mae:0.055498
[363]	Test-mae:0.055461
[364]	Test-mae:0.055425
[365]	Test-mae:0.055389
[366]	Test-mae:0.055354
[367]	Test-mae:0.055319
[368]	Test-mae:0.055285
[369]	Test-mae:0.055251
[370]	Test-mae:0.055218
[371]	Test-mae:0.055185
[372]	Test-mae:0.055153
[373]	Test-mae:0.055122
[374]	Test-mae:0.055091
[375]	Test-mae:0.05506
[376]	Test-mae:0.05503
[377]	Test-mae:0.055
[378]	Test-mae:0.05497
[379]	Test-mae:0.054942
[380]	Test-mae:0.054914
[381]	Test-mae:0.054886
[382]	Test-mae:0.054858
[383]	Test-mae:0.054831
[384]	Test-mae:0.054805

[687]	Test-mae:0.053143
[688]	Test-mae:0.053143
[689]	Test-mae:0.053143
[690]	Test-mae:0.053143
[691]	Test-mae:0.053143
[692]	Test-mae:0.053143
[693]	Test-mae:0.053142
[694]	Test-mae:0.053142
[695]	Test-mae:0.053141
[696]	Test-mae:0.053141
[697]	Test-mae:0.05314
[698]	Test-mae:0.05314
[699]	Test-mae:0.05314
[700]	Test-mae:0.053139
[701]	Test-mae:0.053139
[702]	Test-mae:0.053139
[703]	Test-mae:0.053139
[704]	Test-mae:0.053138
[705]	Test-mae:0.053138
[706]	Test-mae:0.053138
[707]	Test-mae:0.053138
[708]	Test-mae:0.053137
[709]	Test-mae:0.053137
[710]	Test-mae:0.053136
[711]	Test-mae:0.053136
[712]	Test-mae:0.053136
[713]	Test-mae:0.053136
[714]	Test-mae:0.053136
[715]	Test-mae:0.053135
[716]	Test-mae:0.053135
[717]	Test-mae:0.053134
[718]	Test-mae:0.053134
[719]	Test-mae:0.053133
[720]	Test-mae:0.053133
[721]	Test-mae:0.053133
[722]	Test-mae:0.053132
[723]	Test-mae:0.053132
[724]	Test-mae:0.053132
[725]	Test-mae:0.053132
[726]	Test-mae:0.053131
[727]	Test-mae:0.053132
[728]	Test-mae:0.05

In [153]:
# Hold-out MAE of the final model. scikit-learn's metric signature is
# (y_true, y_pred), so the ground truth goes first.
mean_absolute_error(y_test, best_model.predict(dtest))

0.05313128537286346

In [207]:
# Build the submission file.
# Features are taken in the same column order the model was trained on.
select_feature = X_train.columns.tolist()

# One row per ParcelId in the sample submission, joined to its property features.
test_df = submission[['ParcelId']].merge(
    property_df.rename(columns={'parcelid': 'ParcelId'}),
    how='left',
    on='ParcelId',
)
protest = xgb.DMatrix(test_df[select_feature])
y_pred = best_model.predict(protest)

submission = pd.DataFrame({'ParcelId': test_df['ParcelId']})

# Submission column labels. The timestamps are not used by the model, so
# every column receives the same prediction vector.
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}
for label in test_dates:
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred

submission.to_csv('my_submission2.csv', float_format='%.6f', index=False)

Predicting for: 201610 ... 
Predicting for: 201611 ... 
Predicting for: 201612 ... 
Predicting for: 201710 ... 
Predicting for: 201711 ... 
Predicting for: 201712 ... 
