# Comparing 3 ML Models 

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

## Load in the Data and understand the features and target variable

In [7]:
# Fetch the California housing dataset object (feature array, target array, and metadata).
housing = fetch_california_housing()

# Bare last expression: the notebook displays the dataset's description string.
housing.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block group\n        - HouseAge      median house age in block group\n        - AveRooms      average number of rooms per household\n        - AveBedrms     average number of bedrooms per household\n        - Population    block group population\n        - AveOccup      average number of household members\n        - Latitude      block group latitude\n        - Longitude     block group longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000

In [16]:
# Display the whole dataset object to see its fields:
# data, target, frame, target_names, feature_names and DESCR.
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [17]:
# X holds the 8 features; y holds the house prices (the target).
X, y = housing.data, housing.target

# Names of the 8 features
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

## Train and Test the models

In [18]:
# Hold out 20% of the rows for testing. random_state stays at 1 so that
# every group works from the same train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

As a starting point, here are instances of the three algorithm classes
with predetermined hyperparameters. Don't worry about those for now;
each group will get to tune the parameters of its own model.

In [19]:
# Group 1 -- random forest
rf = RandomForestRegressor(random_state=1, n_jobs=-1, n_estimators=100)

# Group 2 -- gradient boosting
gdbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    loss='squared_error',
    random_state=1,
)

# Group 3 -- AdaBoost built on a default DecisionTreeRegressor
abr = AdaBoostRegressor(
    DecisionTreeRegressor(),
    n_estimators=100,
    learning_rate=0.1,
    loss='linear',
    random_state=1,
)

In [20]:
# Compare the MSE and R2 scores for the models

def CV_trees(models, X_train, y_train, X_test, y_test):
    """Fit each model on the training split and print its hold-out metrics.

    Despite the function's name, no cross-validation is performed: every
    model is fit once on ``(X_train, y_train)`` and scored once on
    ``(X_test, y_test)``. The printed line is therefore labelled "Test".

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``. Each one is fit
        in place, so the caller's instances are trained after the call.
    X_train, y_train
        Data passed to ``model.fit``.
    X_test, y_test
        Data passed to ``model.predict`` and used to score the predictions.

    Returns
    -------
    None
        One line per model is printed with its class name, MSE and R2.
    """
    for model in models:
        model_name = type(model).__name__
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        R2 = r2_score(y_test, y_pred)
        # '.6f' states the six-decimal precision explicitly; the previous
        # '2f' only set a minimum field width of 2 and printed the same digits.
        print(
            '{}         Test | MSE: {:.6f} | R2: {:.6f}'.format(model_name, mse, R2)
        )

# Fit and score all three starting models on the same train/test split.
models = [rf, gdbr, abr]
CV_trees(models, X_train, y_train, X_test, y_test)

RandomForestRegressor         Train CV | MSE: 0.254154 | R2: 0.806240
GradientBoostingRegressor         Train CV | MSE: 0.289847 | R2: 0.779028
AdaBoostRegressor         Train CV | MSE: 0.257165 | R2: 0.803944


Which of the models cross-validates the best? How else can we compare the models against each other?

## Grid Search Outline

In [21]:
# Example search space for a random forest. For GradientBoost and AdaBoost
# you will need to update the parameters to ones those estimators accept.
regressor_grid = dict(
    max_depth=[3, None],
    max_features=['sqrt', 'log2', None],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
    n_estimators=[10, 20, 40, 80],
    random_state=[1],  # single value, so every candidate uses the same seed
)

regressor = RandomForestRegressor()  # update if using GradientBoost or AdaBoost

# Search every combination in the grid, scoring candidates by negated MSE.
regressor_gridsearch = GridSearchCV(
    estimator=regressor,
    param_grid=regressor_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=True,
)
regressor_gridsearch.fit(X_train, y_train)

print(f"Best parameters: {regressor_gridsearch.best_params_}")

# Score the winning configuration on the held-out split.
best_model = regressor_gridsearch.best_estimator_
y_pred = best_model.predict(X_test)
mse, R2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print("New Results - MSE: {:2f} | R2: {:2f}".format(mse, R2))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 80, 'random_state': 1}
New Results - MSE: 0.230536 | R2: 0.824245


How did your model improve? 
What are the best parameters? 
What are the most important features? 
What other data might you include to improve your performance?
How often should you train your model if used in a production setting?