In [14]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
# Training split used by the cross-validation and grid-search cells below.
df_train = pd.read_csv("data/TRAIN_SET.csv")

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [20]:
# Shared 5-fold splitter; shuffling with a fixed seed keeps the fold assignment reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=123)

### Model 3 - Decision Tree Regressor using BEDS, BATHS, SQUARE FEET, LOT SIZE, zipcode, LATITUDE, LONGITUDE, Bayes_RatingSchool, crime_percentage, Age, Property type (5 classes) with default parameters

In [22]:
# Baseline decision tree (default hyperparameters), scored with the shared k-fold splitter.
model3 = DecisionTreeRegressor(random_state=123)

# Predictor columns, listed once so the fit and predict calls cannot drift apart.
feature_cols = ['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
                'Bayes_RatingSchool', 'crime_percentage', 'Age',
                'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']

# One RMSE slot per fold; sized from the splitter instead of a second hard-coded 5.
rmse = np.zeros((kfold.get_n_splits(), 1))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)):
    # get the cross validation train set and holdout set
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    # fit the tree on this fold's training rows
    model3.fit(df_tt[feature_cols], df_tt[['log_price']])

    # predict values for holdout set
    pred3 = model3.predict(df_ho[feature_cols])

    # record rmse for this fold (enumerate advances i, so no manual increment is needed)
    rmse[i] = np.sqrt(mean_squared_error(df_ho[['log_price']], pred3))

print('mean rmse for Decision Tree model including traditional features is', rmse.mean())

mean rmse for Decision Tree model including traditional features is 0.14405922910164803


## Model Calibration/Parameter Tuning

### Grid search 

In [23]:
def grid_search(params, model, features=None, target=None, cv=5):
    """Run a cross-validated grid search and print the best configuration.

    Parameters
    ----------
    params : dict
        Hyperparameter grid passed straight to ``GridSearchCV``.
    model : estimator
        Estimator to tune.
    features, target : optional
        Training inputs and response. When omitted, the notebook-level ``X``
        and ``Y`` are used, which is how existing two-argument calls behave.
    cv : optional
        Cross-validation spec forwarded to ``GridSearchCV`` (default 5, as before).

    Prints the best parameter combination and its mean cross-validated RMSE.
    Returns nothing.
    """
    # Fall back to the notebook globals so existing grid_search(params, model) calls keep working.
    features = X if features is None else features
    target = Y if target is None else target

    grid_reg = GridSearchCV(model, params, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=-1)
    grid_reg.fit(features, target)
    best_params = grid_reg.best_params_
    print("Best Parameters:" , best_params)
    # The scorer is the negated RMSE (higher is better), so flip the sign back.
    best_score = -grid_reg.best_score_
    # Label corrected: the scoring metric is root-mean-squared error, not MSE.
    print("RMSE on training set with cross validation: {:.5f}".format(best_score))
    

### Parameters tuning for Model 3 - Decision Tree using grid search 

Here we tune the parameters max_depth, min_samples_leaf, min_samples_split, min_weight_fraction_leaf and max_features.

In [24]:
# Hyperparameter grid for the decision tree. Each key appears exactly once:
# this dict previously listed 'min_samples_leaf' twice, and Python silently
# keeps only the last value for a repeated key, so [1, 2] is the grid that was
# actually searched. It is kept here so the recorded results stay valid.
params = {'max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
          'min_samples_leaf': [1, 2],
          'min_samples_split': [2, 3],
          'min_weight_fraction_leaf': [0.0, 0.0025, 0.05],
          'max_features': [None, 1, 2]}

model3 = DecisionTreeRegressor(random_state = 123)
model = model3

# X and Y are notebook-level names that grid_search reads when called with two arguments.
X = df_train[['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
              'Bayes_RatingSchool', 'crime_percentage', 'Age',
              'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']]
Y = df_train[['log_price']]

grid_search(params, model)

Best Parameters: {'max_depth': 8, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0025}
MSE on training set with cross validation: 0.12871


After tuning the model, we found the best parameters to be the following:

* max_depth = 8 (default = None)
* max_leaf_nodes = None (default; not part of the grid)
* min_samples_leaf = 1 (default)
* min_samples_split = 2 (default)
* min_weight_fraction_leaf = 0.0025 (default = 0.0)
* max_features = None (default)


These are not that different from the default values: only max_depth and min_weight_fraction_leaf changed.

### Model 3 - Decision Tree Regressor using BEDS, BATHS, SQUARE FEET, LOT SIZE, zipcode, LATITUDE, LONGITUDE, Bayes_RatingSchool, crime_percentage, Age, Property type (5 classes) with best parameters

In [25]:
# Decision tree with the hyperparameters selected by the grid search,
# scored with the shared k-fold splitter.
model3 = DecisionTreeRegressor(max_depth=8,
                               min_weight_fraction_leaf=0.0025,
                               random_state=123)

# Predictor columns, listed once so the fit and predict calls cannot drift apart.
feature_cols = ['BEDS', 'BATHS', 'SQUARE FEET', 'LOT SIZE', 'zipcode', 'LATITUDE', 'LONGITUDE',
                'Bayes_RatingSchool', 'crime_percentage', 'Age',
                'Single Family', 'Townhouse', 'Condo', 'Multi_Family4', 'Multi_Family5']

# One RMSE slot per fold; sized from the splitter instead of a second hard-coded 5.
rmse = np.zeros((kfold.get_n_splits(), 1))

for i, (train_index, test_index) in enumerate(kfold.split(df_train)):
    # get the cross validation train set and holdout set
    df_tt = df_train.iloc[train_index]
    df_ho = df_train.iloc[test_index]

    # fit the tuned tree on this fold's training rows
    model3.fit(df_tt[feature_cols], df_tt[['log_price']])

    # predict values for holdout set
    pred3 = model3.predict(df_ho[feature_cols])

    # record rmse for this fold (enumerate advances i, so no manual increment is needed)
    rmse[i] = np.sqrt(mean_squared_error(df_ho[['log_price']], pred3))

print('mean rmse for Decision Tree model including traditional features is', rmse.mean())

mean rmse for Decision Tree model including traditional features is 0.12776247525925472
