In [1]:
# Tune the Random Forest model.

# Resurrect the three models.
#import joblib
#Lmodel = joblib.load("housing_linear_regression_model.pkl")
#Dmodel = joblib.load("housing_decision_tree_model.pkl")
#Rmodel = joblib.load("housing_random_forest_model.pkl")

# First, resurrect the data.
import pandas as pd
import numpy as np
datapath="/Users/jasonmiller/Source/MachineLearning/datasets/housing/housing.csv"
all_data=pd.read_csv(datapath)
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(all_data,test_size=0.2,random_state=42)
train_predictors = train_set.drop(["median_house_value"],axis=1)
train_labels = train_set["median_house_value"].copy()
from sklearn.base import BaseEstimator, TransformerMixin
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6  # hard coded index
class AddFeatures (BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self   # required by base class
    def transform(self,X,y=None):
        rooms_per_household = X[:,rooms_ix]/X[:,households_ix]
        population_per_household = X[:,population_ix]/X[:,households_ix]
        bedrooms_per_room = X[:,bedrooms_ix]/X[:,rooms_ix]
        # numpy shorthand for a column-wise concatenation
        return np.c_[X,rooms_per_household,population_per_household,bedrooms_per_room]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
numeric_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('feater_adder',AddFeatures()),
    ('scaler',StandardScaler())
])
categoric_features=['ocean_proximity']
numeric_features = list(train_predictors)
numeric_features.remove(categoric_features[0])
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
full_pipeline = ColumnTransformer([
    ("num", numeric_pipeline, numeric_features),  # dense matrix
    ("cat", OneHotEncoder(), categoric_features)  # sparse matrix
])
#
prepared_train_predictors = full_pipeline.fit_transform(train_predictors)

In [2]:
# Use tool for testing parameters.
from sklearn.model_selection import GridSearchCV
# Explore hyper parameters for Random Forest
from sklearn.ensemble import RandomForestRegressor
Rmodel = RandomForestRegressor()
parameter_grid = [
    {'bootstrap':[True],  'n_estimators':[3,10,30], 'max_features':[2,4,6,8]},
    {'bootstrap':[False], 'n_estimators':[3,10,30], 'max_features':[2,4,6,8]}
]
# Select cv=5 for five-way cross-validation.
grid_search = GridSearchCV(Rmodel,
                          parameter_grid,
                          cv=5, 
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search.fit(prepared_train_predictors,train_labels)
# This takes a long time (3 minutes?) at 99% CPU.

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'bootstrap': [True], 'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [3]:
# This gives the score for each parameter combination.
grid_search.cv_results_
# These desribes the best so far.
grid_search.best_estimator_ 
grid_search.best_params_

{'bootstrap': False, 'max_features': 6, 'n_estimators': 30}

In [4]:
parameter_grid_2 = [
    {'bootstrap':[False], 'n_estimators':[30,50,100], 'max_features':[4,6,8]}
]
grid_search_2 = GridSearchCV(Rmodel,
                          parameter_grid_2,
                          cv=5,
                          scoring='neg_mean_squared_error',
                          return_train_score=True)
grid_search_2.fit(prepared_train_predictors,train_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'bootstrap': [False], 'max_features': [4, 6, 8],
                          'n_estimators': [30, 50, 100]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [6]:
# The book has a script to extract rmse from this rather large data structure.
# Our second search did not improve on the first.
grid_search_2.cv_results_
grid_search.best_params_

{'bootstrap': False, 'max_features': 6, 'n_estimators': 30}

In [8]:
# This tool tests random parameter combinations instead of all of them.
# It can sample from given features according to given distributions, not just randomly.
# We will try it with default parameters.
from sklearn.model_selection import RandomizedSearchCV
grid_search_3 = RandomizedSearchCV(Rmodel,list(prepared_train_predictors))
grid_search_2.fit(prepared_train_predictors,train_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'bootstrap': [False], 'max_features': [4, 6, 8],
                          'n_estimators': [30, 50, 100]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [9]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 6, 'n_estimators': 30}

In [10]:
# The random search also did not improve.
# Let's go with the first one.
type(grid_search.best_estimator_)

sklearn.ensemble._forest.RandomForestRegressor

In [13]:
# Extract the random forest with best hyper parameters.
best_model = grid_search.best_estimator_
# Save it.
import joblib
joblib.dump(Rmodel,"housing_tuned_random_forest_model.pkl")
# Ask the model for relative feature importance.
# Book gives a convoluted way to figure out which is which, unfortunately.
# Our #1 feature (median_income) has importance 33.86%.
best_model.feature_importances_

array([7.62768520e-02, 6.82048196e-02, 4.33951215e-02, 1.70349677e-02,
       1.57122728e-02, 1.65873394e-02, 1.56688070e-02, 3.38621010e-01,
       5.67202726e-02, 1.04354580e-01, 7.93161886e-02, 1.25611692e-02,
       1.49445593e-01, 1.89113773e-04, 2.32228739e-03, 3.58960563e-03])

In [14]:
# Finally, if we are done tuning, run the model on the test data.
# This has to be the final step! There is no more data to test on.
test_predictors = test_set.drop(["median_house_value"],axis=1)
test_labels = test_set["median_house_value"].copy()
# Model should be read only but it is not. Avoid retraining it.
prepared_test_predictors = full_pipeline.transform(test_predictors) # do not call fit_transform() !!!
final_predictions = best_model.predict(prepared_test_predictors)
from sklearn.metrics import mean_squared_error
final_mse = mean_squared_error(test_labels,final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

48579.92482709519

In [15]:
# Model is off by under $50K. Better than before.
# Report confidence interval for this estimate.
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - test_labels) ** 2
np.sqrt(stats.t.interval(confidence,
                        len(squared_errors)-1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))

array([46394.14306684, 50671.50759062])

In [None]:
# 95% confidence interval is $46.4K to $50.7K.