In [27]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
# Training split used by every model and cross-validation run below.
train_csv_path = "data/TRAIN_SET.csv"
df_train = pd.read_csv(train_csv_path)

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [30]:
# Shared 5-fold splitter; shuffling with a fixed seed keeps the folds
# identical across the manual cross-validation loops below.
kfold = KFold(n_splits=5, shuffle=True, random_state=123)

### Model 5 - XGBoost Regressor using BEDS, BATHS, SQUARE FEET, LOT SIZE, zipcode, LATITUDE, LONGITUDE, Bayes_RatingSchool, crime_percentage, Age, Property type (5 classes) with default parameters

In [32]:
# Model 5 with default hyper-parameters, scored by K-fold cross-validation
# on log_price.

# Feature columns for Model 5, defined once so that the fit and predict
# calls are guaranteed to use the same columns in the same order.
model5_features = ['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
                   'Bayes_RatingSchool', 'crime_percentage', 'Age',
                   'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']

# define model object
model5 = xgb.XGBRegressor(random_state=123)

# one RMSE slot per fold; sized from the splitter instead of a literal 5
rmse = np.zeros((kfold.get_n_splits(), 1))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)):
    # get the cross validation train set and holdout set
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    # fit model 5 on the training folds (refit from scratch on every fold)
    model5.fit(df_tt[model5_features], df_tt.log_price.values)

    # predict values for holdout set
    pred5 = model5.predict(df_ho[model5_features])

    # record the holdout RMSE; `i` comes from enumerate, so no manual increment
    rmse[i] = np.sqrt(mean_squared_error(df_ho.log_price.values, pred5))

print('mean rmse for XGBoost model including non traditional features is', rmse.mean())

mean rmse for XGBoost model including non traditional features is 0.09910098787660755


## Model Calibration/Parameter Tuning

### Grid search 

In [33]:
def grid_search(params, model, X=None, Y=None, cv=5):
    """Grid-search `params` for `model` and print the best setting and its RMSE.

    Parameters
    ----------
    params : dict
        Parameter grid passed to GridSearchCV (name -> list of candidates).
    model : estimator
        Unfitted estimator to tune.
    X, Y : optional
        Feature matrix and target. When omitted, the notebook-level variables
        `X` and `Y` are used, which is how this function originally worked.
        Passing them explicitly avoids depending on hidden notebook state.
    cv : int or cross-validation splitter, default 5
        Forwarded to GridSearchCV. Pass the notebook's `kfold` object to score
        on the same shuffled folds as the manual cross-validation loops.

    Returns
    -------
    GridSearchCV
        The fitted search object, so callers can reuse `best_params_` instead
        of copying the printed values by hand.
    """
    # Backward compatibility: fall back to the notebook globals when the data
    # is not passed in. A KeyError here means X / Y were never defined.
    if X is None:
        X = globals()["X"]
    if Y is None:
        Y = globals()["Y"]

    grid_reg = GridSearchCV(model, params, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
    grid_reg.fit(X, Y)

    best_params = grid_reg.best_params_
    print("Best Parameters:" , best_params)

    # The scorer is the negated RMSE (higher is better), so flip the sign back.
    best_score = -grid_reg.best_score_
    print("RMSE on training set with cross validation: {:.5f}".format(best_score))

    return grid_reg
    

### Parameters tuning for Model 5 - XGBoost using grid search 



In [34]:
# Candidate values explored for each XGBoost hyper-parameter.
params = {
    'learning_rate': [0.3, 0.2, 0.1],
    'n_estimators': [100, 150],
    'subsample': [1, 0.6, 0.5],
    'max_depth': [6, 5],
    'min_child_weight': [1, 2],
}

# Untuned estimator handed to the search; `model` is an alias of the same object.
model5 = xgb.XGBRegressor()
model = model5

# Full training set: Model 5 feature columns and the log-price target.
feature_columns = [
    'BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
    'Bayes_RatingSchool', 'crime_percentage', 'Age',
    'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5',
]
X = df_train[feature_columns]
Y = df_train['log_price'].values

grid_search(params, model)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 150, 'subsample': 0.6}
RMSE on training set with cross validation: 0.09606


After tuning the model, we found the best parameters to be the following:

* learning_rate=0.1, (default=0.3)
* max_depth=5, (default =6)
* min_child_weight=2, (default=1)
* n_estimators=150, (default=100)
* subsample=0.6, (default=1)


These are not that different from the default values. 

### Model 5 - XGBoost Regressor using BEDS, BATHS, SQUARE FEET, LOT SIZE, zipcode, LATITUDE, LONGITUDE, Bayes_RatingSchool, crime_percentage, Age, Property type (5 classes) with best parameters

In [35]:
# Model 5 with the hyper-parameters selected by the grid search, scored by
# K-fold cross-validation on log_price.

# Feature columns for Model 5, defined once so that the fit and predict
# calls are guaranteed to use the same columns in the same order.
model5_features = ['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
                   'Bayes_RatingSchool', 'crime_percentage', 'Age',
                   'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']

# define model object
model5 = xgb.XGBRegressor(learning_rate=0.1,
                          max_depth=5,
                          min_child_weight=2,
                          n_estimators=150,
                          subsample=0.6)

# one RMSE slot per fold; sized from the splitter instead of a literal 5
rmse = np.zeros((kfold.get_n_splits(), 1))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)):
    # get the cross validation train set and holdout set
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    # fit the tuned model 5 on the training folds (refit from scratch on every fold)
    model5.fit(df_tt[model5_features], df_tt.log_price.values)

    # predict values for holdout set
    pred5 = model5.predict(df_ho[model5_features])

    # record the holdout RMSE; `i` comes from enumerate, so no manual increment
    rmse[i] = np.sqrt(mean_squared_error(df_ho.log_price.values, pred5))

print('mean rmse for XGBoost model including non traditional features is', rmse.mean())

mean rmse for XGBoost model including non traditional features is 0.09547820019113182


Of the models we tried, XGBoost seems to perform well.