# Zillow’s Home Value Prediction
Competition page: https://www.kaggle.com/c/zillow-prize-1/overview

Zillow provides a “Zestimate”, which is an estimated property value.

The competition target is the log error between the Zestimate and the actual sale price:

$$\text{logerror} = \log(\text{Zestimate}) - \log(\text{SalePrice})$$


#### 1. Our task: predict Zillow's logerror from the given house features

#### 2. Technical goals:

- Dealing with the missing data
- Choosing an appropriate predictive model



##### References:

* https://towardsdatascience.com/fine-tuning-xgboost-in-python-like-a-boss-b4543ed8b1e

* https://xgboost.readthedocs.io/en/latest/python/python_api.html

* https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f

In [129]:
# import required packages
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import xgboost as xgb
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
%matplotlib inline

In [210]:
# Load the three competition files: the property feature table, the 2016 training
# labels, and the sample submission that lists the parcels to predict.
property_df, train_df, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/properties_2016.csv',
        'input/train_2016_v2.csv',
        'input/sample_submission.csv',
    )
)

In [211]:
# Report (rows, columns) for the label table and the property table.
for shape_label, frame in (('Shape Train data', train_df), ('Shape Property', property_df)):
    print(shape_label, frame.shape)

Shape Train data (90275, 3)
Shape Property (2985217, 58)


# Feature engineering

- Target variable: remove outliers
- Missing data: remove columns with too many missing values
- Impute the remaining missing values and label-encode the categorical columns

In [212]:
# Trim the target: keep only rows whose logerror lies strictly between the
# 1st and 99th percentiles.
logerror_values = train_df['logerror'].values
llimit, ulimit = np.percentile(logerror_values, [1, 99])
inside_band = (train_df['logerror'] < ulimit) & (train_df['logerror'] > llimit)
train_df = train_df[inside_band]
print(train_df.shape)

# Fraction of missing values per property column (the 'counts' column holds a
# ratio in [0, 1]); keep only columns that have gaps, ordered from least to most
# incomplete.
missing_df = (property_df.isnull().sum() / len(property_df)).reset_index()
missing_df.columns = ['column_name', 'counts']
has_gaps = missing_df['counts'] > 0
missing_df = missing_df[has_gaps].sort_values(by='counts')

# Optional visual check of the missing-value ratios (left disabled):
# # barplot
# ind = np.arange(missing_df.shape[0])
# width = 0.9
# fig, ax = plt.subplots(figsize=(12,18))
# rects = ax.barh(ind, missing_df.counts, color='blue')
# ax.set_yticks(ind)
# ax.set_yticklabels(missing_df.column_name, rotation='horizontal')
# ax.set_xlabel("Count of missing values")
# ax.set_title("Number of missing values in each column")
# plt.show()

# Discard every column in which more than 97% of the rows are missing.
drop_index = missing_df.loc[missing_df['counts'] > 0.97, 'column_name']
property_df = property_df.drop(columns=drop_index)


# Fill every remaining gap with the sentinel -1, then turn each object-typed
# column into integer codes with a LabelEncoder fitted on that column's values.
for col in property_df:
    property_df[col] = property_df[col].fillna(-1)
    if property_df[col].dtype != 'object':
        continue  # numeric column: imputation is all that is needed
    lbl = LabelEncoder()
    # The values are passed as a plain list, as before.
    property_df[col] = lbl.fit_transform(list(property_df[col].values))

(88465, 3)


In [216]:
# Attach the property features to each labelled transaction, then separate the
# target from the predictors (identifier and transaction date are not used as features).
df_train = pd.merge(train_df, property_df, how='left', on='parcelid')
train_y = df_train['logerror']
train_x = df_train.drop(columns=['parcelid', 'logerror', 'transactiondate'])

In [217]:
# Hold out 33% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    random_state=42,
    test_size=0.33,
)
(X_train.shape, X_test.shape)

((59271, 40), (29194, 40))

# Main method: XGBoost

In [1]:
# First round: baseline model with early stopping

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# params
# Following the first reference, start with eta=0.01 and a cap of 999 boosting
# rounds, and let early stopping find the best number of rounds at this
# learning rate.
# MAE (mean absolute error) is used to evaluate the quality of the predictions.

params = {
    # Parameters that we are going to tune.
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': 0.01,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear' alias.
    'objective': 'reg:squarederror',
    'eval_metric': 'mae'
}

num_boost_round = 999

# Training stops once the "Test" MAE has not improved for 10 consecutive rounds.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

In [108]:
# best_iteration is reported as a round count by adding 1.
best_mae = model.best_score
best_rounds = model.best_iteration + 1
print("Best MAE: {:.4f} with {} rounds".format(best_mae, best_rounds))

Best MAE: 0.0532 with 737 rounds


In [2]:
# Cross-validate the same parameters on the training matrix with XGBoost's
# built-in cv helper, which returns mean MAE scores over the 5 folds.

cv_results = xgb.cv(
    params,
    dtrain,
    nfold=5,
    num_boost_round=num_boost_round,
    early_stopping_rounds=10,
    metrics={'mae'},
    seed=42,
)

cv_results


In [110]:
# Lowest mean cross-validated test MAE found in the cv results
cv_results['test-mae-mean'].min()

0.052557200000000005

# Tuning
- Main method: RandomizedSearchCV

In [218]:
# Randomized search over tree-shape and sampling hyperparameters at
# learning_rate=0.01. n_estimators stays close to the 737 rounds selected by
# early stopping in the baseline run.

param_dist = {
        'n_estimators':[737,738,739,740],
        'max_depth':range(3,12),
        'subsample':np.linspace(0.8,1,5),
        'colsample_bytree':np.linspace(0.5,0.98,10),
        'min_child_weight':range(1,9)
        }


# Named `xgb_regressor` rather than `xgb`: rebinding `xgb` would shadow the
# `xgboost` module alias and break the later `xgb.train` / `xgb.DMatrix` calls
# when the notebook is run top to bottom.
xgb_regressor = XGBRegressor(learning_rate=0.01)


# 5 sampled configurations, each scored by 5-fold CV on negative MAE.
random_search = RandomizedSearchCV(xgb_regressor, param_distributions=param_dist,
                                   n_iter=5, scoring='neg_mean_absolute_error',
                                   n_jobs=4, cv=5,
                                   verbose=3, random_state=100)


random_search.fit(X_train,y_train)


In [125]:
# Best sampled configuration, the refitted estimator and its mean CV score (negative MAE)
random_search.best_params_, random_search.best_estimator_, random_search.best_score_

({'subsample': 1.0,
  'n_estimators': 740,
  'min_child_weight': 6,
  'max_depth': 6,
  'colsample_bytree': 0.9266666666666666},
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.9266666666666666, gamma=0,
        importance_type='gain', learning_rate=0.01, max_delta_step=0,
        max_depth=6, min_child_weight=6, missing=None, n_estimators=740,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=None, subsample=1.0, verbosity=1),
 -0.05254329764833132)

In [142]:
# Tune the learning rate while the tree and sampling parameters stay fixed at
# the values picked by the randomized search.

param_eta = {'learning_rate': [0.05, 0.01, 0.005]}

xgb2 = XGBRegressor(
    colsample_bytree=0.927,
    min_child_weight=6,
    n_estimators=740,
    subsample=1,
    max_depth=6,
)

# Exhaustive 5-fold search over the three candidate rates, scored on negative MAE.
gs = GridSearchCV(
    estimator=xgb2,
    param_grid=param_eta,
    scoring='neg_mean_absolute_error',
    cv=5,
)
gs.fit(X_train, y_train)

In [141]:
# Best learning rate, the refitted estimator and its mean CV score (negative MAE)
gs.best_params_, gs.best_estimator_, gs.best_score_

({'learning_rate': 0.01},
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bynode=1, colsample_bytree=0.927, gamma=0,
        importance_type='gain', learning_rate=0.01, max_delta_step=0,
        max_depth=6, min_child_weight=6, missing=None, n_estimators=740,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=None, subsample=1, verbosity=1),
 -0.05254329764833132)

In [3]:
# Final model: retrain with the tuned hyperparameters for a fixed 740 rounds
# (no early stopping); the "Test" MAE is still logged during training.

num_boost_round=740
params = {
    # Tuned values chosen by the searches.
    'max_depth':6,
    'min_child_weight': 6,
    'eta':0.01,
    'subsample': 1,
    'colsample_bytree': 0.927,
    # Other parameters
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear' alias.
    'objective':'reg:squarederror',
    'eval_metric':'mae'
}


best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)



In [153]:
# Hold-out MAE of the final model. scikit-learn metrics take (y_true, y_pred);
# the value is the same either way for unweighted MAE, but the conventional
# order keeps the call correct if the metric is ever swapped or weighted.
mean_absolute_error(y_test, best_model.predict(dtest))

0.05313128537286346

In [207]:
# Build the submission file
# Use exactly the feature columns (and order) the model was trained on.
select_feature = X_train.columns.tolist()
test_df = submission[['ParcelId']].merge(
    property_df.rename(columns={'parcelid': 'ParcelId'}),
    how='left',
    on='ParcelId',
)
protest = xgb.DMatrix(test_df[select_feature])
y_pred = best_model.predict(protest)


# Start the output frame from the parcel identifiers only.
submission = test_df[['ParcelId']].copy()

test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
    '201711': pd.Timestamp('2017-10-31'),
    '201712': pd.Timestamp('2017-11-30')
}
# The transaction date is not a model feature, so the same prediction vector
# fills every period column; only the dictionary keys are needed here.
for label in test_dates:
    print("Predicting for: %s ... " % (label))
    submission[label] = y_pred

submission.to_csv('my_submission2.csv', float_format='%.6f',index=False)

Predicting for: 201610 ... 
Predicting for: 201611 ... 
Predicting for: 201612 ... 
Predicting for: 201710 ... 
Predicting for: 201711 ... 
Predicting for: 201712 ... 
