In [39]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [40]:
path = 'data/Melbourne_housing_FULL.csv'
house_df = pd.read_csv(path)
house_df = house_df.dropna(axis=0, how='any')

In [41]:
print(house_df.columns)

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')


In [42]:
X = house_df[['Rooms', 'Bathroom', 'Car', 'BuildingArea', 'Lattitude', 'Longtitude']]
y = house_df['Price']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [44]:
house_model = RandomForestRegressor(random_state=17)
house_model.fit(X_train, y_train)
preds = house_model.predict(X_test)

In [45]:
print(preds[:10])
print(y_test[:10].values)

[1154950.   2736650.   1729580.    849740.   1291377.76 1264850.
  490278.88  417380.   2419425.    561008.86]
[1064000. 2315000. 2200000.  950000. 1160000.  965000.  360000.  380000.
 1810000.  563000.]


In [46]:
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
print(mae)

197562.29843577213


In [47]:
from sklearn.model_selection import GridSearchCV

In [48]:
param_grid = {
    'n_estimators': [75, 100, 150],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [3, 5, 7],
    'min_samples_split': [5, 10]
}

# Step 1: Compare Different Parameters
You know the best tree size. If you were going to deploy this model in practice, you would make it even more accurate by
using all of the data and keeping that tree size. That is, you don't need to hold out the validation data now that you've made all your modeling decisions.

In [49]:
model_gs = GridSearchCV(
    house_model,
    param_grid,
    cv=5
)
model_gs.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=17),
             param_grid={'max_depth': [10, 20, 30],
                         'min_samples_leaf': [3, 5, 7],
                         'min_samples_split': [5, 10],
                         'n_estimators': [75, 100, 150]})

In [50]:
print('Best parameters for Random Forest Tree:', model_gs.best_params_)
best_n_estimators = model_gs.best_params_['n_estimators']
best_max_depth = model_gs.best_params_['max_depth']
best_min_samples_leaf = model_gs.best_params_['min_samples_leaf']
best_min_samples_split = model_gs.best_params_['min_samples_split']

Best parameters for Random Forest Tree: {'max_depth': 20, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 150}


In [51]:
final_model = RandomForestRegressor(
    n_estimators=75,
    min_samples_leaf=3,
    min_samples_split=8,
    random_state=17
)
final_model.fit(X, y)

RandomForestRegressor(min_samples_leaf=3, min_samples_split=8, n_estimators=75,
                      random_state=17)

In [52]:
y_pred = final_model.predict(X)
final_mae = mean_absolute_error(y_true=y, y_pred=y_pred)
print(
    final_mae
)
print('Difference between tuned and default Random Forest Tree is', mae - final_mae)

121411.13983298182
Difference between tuned and default Random Forest Tree is 76151.15860279031
