In [46]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt

In [47]:
# Load the training split; every model below is fit and cross-validated on this frame.
df_train = pd.read_csv(
    "data/TRAIN_SET.csv"
)

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [49]:
# Shared 5-fold splitter (shuffled, fixed seed) reused by the manual CV loops below.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=123,
)

### Model 4 - Random Forest Regressor using BEDS, BATHS, SQUARE FEET, LOT SIZE, LATITUDE, LONGITUDE, Bayes_RatingSchool, crime_percentage, Age, zipcode, Property type (5 classes) with default parameters

In [50]:
# Columns fed to Model 4, listed once so that fit and predict cannot drift apart.
# The order is kept exactly as before: the forest is seeded, and column order
# influences the fitted trees.
model4_features = ['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
                   'Bayes_RatingSchool', 'crime_percentage', 'Age',
                   'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']

# define model object (default hyper-parameters, fixed seed)
model4 = RandomForestRegressor(random_state = 123)

# one RMSE slot per fold; sized from the splitter instead of a hard-coded 5
rmse = np.zeros((kfold.get_n_splits(), 1))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)):
    # get the cross validation train set and holdout set
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    # fit model4 on the training folds; the target is the log_price column
    model4.fit(df_tt[model4_features], df_tt.log_price.values)

    # predict values for holdout set
    pred4 = model4.predict(df_ho[model4_features])

    # record the fold RMSE (square root of the holdout MSE).
    # `enumerate` already advances `i`, so no manual increment is needed.
    rmse[i] = np.sqrt(mean_squared_error(df_ho['log_price'].values, pred4))

print('mean rmse for Random Forest model including non traditional features is', rmse.mean())

mean rmse for Random Forest model including non traditional features is 0.10290433802072982


## Model Calibration/Parameter Tuning

### Grid search 

In [51]:
def grid_search(params, model):
    """Run an exhaustive grid search for `model` and print the best result.

    The search is fit on the notebook-level globals `X` (feature frame) and
    `Y` (target array), which must be defined before this function is called.

    Parameters
    ----------
    params : dict
        Parameter grid handed to `GridSearchCV` (name -> list of candidates).
    model : estimator
        Unfitted estimator to tune.

    Returns
    -------
    None
        The best parameters and best score are printed, not returned.

    Notes
    -----
    Scoring is 'neg_root_mean_squared_error', so the negated best score is an
    RMSE (not an MSE). The search uses `cv = 5` rather than the notebook's
    shuffled `kfold` object, so its folds are not the ones used by the manual
    cross-validation loops.
    """
    grid_reg = GridSearchCV(model, params, scoring ='neg_root_mean_squared_error', cv = 5, n_jobs = -1)
    grid_reg.fit(X, Y)
    best_params = grid_reg.best_params_
    print("Best Parameters:" , best_params)
    # sklearn maximises scores, so the RMSE comes back negated; flip the sign
    best_score = -grid_reg.best_score_
    print("RMSE on training set with cross validation: {:.5f}".format(best_score))
    

### Parameters tuning for Model 4 - Random Forest Regressor using grid search 



In [52]:
# Design matrix and target read by grid_search() through the notebook globals X / Y.
X = df_train[[
    'BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode',
    'LATITUDE', 'LONGITUDE',
    'Bayes_RatingSchool', 'crime_percentage', 'Age',
    'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5',
]]
Y = df_train['log_price'].values

# Candidate values per hyper-parameter; every combination is evaluated.
params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 1],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'min_weight_fraction_leaf': [0.0, 0.0025],
    'max_features': [1, 2],
}

# Fresh, seeded forest to tune; `model` is kept as an alias of the same object.
model4 = RandomForestRegressor(random_state=123)
model = model4

grid_search(params, model)

Best Parameters: {'max_depth': None, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200}
MSE on training set with cross validation: 0.10294


After tuning the model, we found the best parameters to be the following:

* `max_depth`: None (default)
* `max_features`: 2 (default = 1.0, i.e. all features are considered at each split)
* `min_samples_leaf`: 1 (default)
* `min_samples_split`: 3 (default = 2)
* `min_weight_fraction_leaf`: 0.0 (default)
* `n_estimators`: 200 (default = 100)

Apart from `max_features`, these are not that different from the default values.

### Model 4 - Random Forest Regressor using BEDS, BATHS, SQUARE FEET, LOT SIZE, LATITUDE, LONGITUDE, Bayes_RatingSchool, crime_percentage, Age, zipcode, Property type (5 classes) with best parameters

In [53]:
# Columns fed to Model 4, listed once so that fit and predict cannot drift apart.
# The order is kept exactly as before: the forest is seeded, and column order
# influences the fitted trees.
model4_features = ['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
                   'Bayes_RatingSchool', 'crime_percentage', 'Age',
                   'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']

# define model object with the hyper-parameters selected by the grid search
model4 = RandomForestRegressor(max_features=2,min_samples_split=3,n_estimators=200,random_state = 123)

# one RMSE slot per fold; sized from the splitter instead of a hard-coded 5
rmse = np.zeros((kfold.get_n_splits(), 1))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)):
    # get the cross validation train set and holdout set
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    # fit model4 on the training folds; the target is the log_price column
    model4.fit(df_tt[model4_features], df_tt.log_price.values)

    # predict values for holdout set
    pred4 = model4.predict(df_ho[model4_features])

    # record the fold RMSE (square root of the holdout MSE).
    # `enumerate` already advances `i`, so no manual increment is needed.
    rmse[i] = np.sqrt(mean_squared_error(df_ho['log_price'].values, pred4))

print('mean rmse for Random Forest model including non traditional features is', rmse.mean())

mean rmse for Random Forest model including non traditional features is 0.10108838956664287
