# Comparing 3 ML Models 

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

## Load in the Data and understand the features and target variable

In [None]:
# Load the California housing dataset bundle.
housing = fetch_california_housing()

# DESCR is one long multi-line string. Print it so the line breaks render,
# instead of showing up as literal "\n" escapes in the cell's repr output.
print(housing.DESCR)

In [None]:
# Pull the feature matrix (the 8 features) and the target vector
# (house prices) out of the dataset bundle.
X, y = housing.data, housing.target

# Display the names of the features.
housing.feature_names

## Train and Test the models

In [None]:
# Hold out 20% of the rows for testing. The random state stays fixed at 1
# so that every group works with the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

As a starting point, here are instances of the three algorithm
classes with predetermined hyperparameters. Don't worry about those for now;
each group will get to tune its own parameters.

In [None]:
# Baseline models, one per group. The hyperparameters are only starting
# values; all three models share random_state=1.

# Group 1: random forest
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=1,
    n_jobs=-1,
)

# Group 2: gradient boosting
gdbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    loss='squared_error',
    random_state=1,
)

# Group 3: AdaBoost with a decision tree as its base regressor
abr = AdaBoostRegressor(
    DecisionTreeRegressor(),
    n_estimators=100,
    learning_rate=0.1,
    loss='linear',
    random_state=1,
)

In [None]:
# Compare the MSE and R2 scores for the models

def CV_trees(models, X_train, y_train, X_test, y_test):
    """Fit each model on the training split and print its test-split scores.

    Despite the name, no cross-validation is performed: every model is fitted
    once on ``(X_train, y_train)`` and scored once on ``(X_test, y_test)``.

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each one is
        fitted in place.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Held-out features and targets used for scoring.

    Returns
    -------
    None
        One line per model is printed, holding the model's class name, its
        mean squared error and its R2 score.
    """
    for model in models:
        model_name = type(model).__name__
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Left-align the name in a fixed-width column so the rows line up,
        # label the scores with the split they really come from, and use an
        # explicit precision ("{:2f}" only sets a minimum field width of 2).
        print(f'{model_name:<26} Test | MSE: {mse:.3f} | R2: {r2:.3f}')

# Run the comparison for the three baseline regressors on the shared split.
models = [rf, gdbr, abr]
CV_trees(
    models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Which of the models performs the best? Why is it inappropriate
to judge the performance of the models
based only on the evidence we have so far?

## Grid Search Outline

In [None]:
# Example search space for a random forest. GradientBoost and AdaBoost take
# different hyperparameters, so update the keys and values when tuning those.
regressor_grid = {
    'max_depth': [3, None],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'n_estimators': [10, 20, 40, 80],
    'random_state': [1],
}

regressor = RandomForestRegressor()  # update if using GradientBoost or AdaBoost

# Search the grid, scoring each candidate by negative mean squared error.
regressor_gridsearch = GridSearchCV(
    estimator=regressor,
    param_grid=regressor_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=True,
)

regressor_gridsearch.fit(X_train, y_train)

print(f"Best parameters: {regressor_gridsearch.best_params_}")

# Keep the best estimator found by the search for evaluation.
best_model = regressor_gridsearch.best_estimator_

In [None]:
# Score the model selected by the grid search on the held-out test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
R2 = r2_score(y_test, y_pred)
# "{:2f}" only sets a minimum field width of 2 and leaves the default six
# decimals; use an explicit precision instead.
print("New Results - MSE: {:.3f} | R2: {:.3f}".format(mse, R2))