# Jupyter notebook - Random Forest for Regression

In this exercise we estimate the median value of Boston houses using a Random Forest.

We use GridSearchCV to perform cross-validation.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns 
from scipy import stats # I might use this
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,confusion_matrix, classification_report
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Boston housing CSV (path is relative to the notebook's folder)
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='../Dataset/BostonHouse.csv')
data.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


Data description:

1. crim: per capita crime rate by town
2. zn: proportion of residential land zoned for lots over 25,000 sq.ft.
3. indus: proportion of non-retail business acres per town.
4. chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
5. nox: nitrogen oxides concentration (parts per 10 million).
6. rm: average number of rooms per dwelling
7. age: proportion of owner-occupied units built prior to 1940.
8. dis: weighted mean of distances to five Boston employment centres. 
9. rad: index of accessibility to radial highways.
10. tax: full-value property-tax rate per 10,000 dollars.
11. ptratio: pupil-teacher ratio by town
12. b: $1000(B_k - 0.63)^2$ where $B_k$ is the proportion of blacks by town
13. lstat: lower status of the population (percent)
14. medv: median value of owner-occupied homes in thousands of dollars.


In [3]:
# Divide X and y
# The CSV carries its saved row index as an 'Unnamed: 0' column (see the
# data.head() preview). It is a row counter, not a housing attribute, so it is
# dropped from the feature matrix together with the target 'medv'.
non_feature_cols = ['medv'] + [col for col in data.columns
                               if str(col).startswith('Unnamed')]
X = data.drop(non_feature_cols, axis = 1)
y = data['medv']

# NOTE(review): test_size = 0.8 holds out 80% of the rows and trains on only
# 20% of them -- confirm this is intended rather than test_size = 0.2.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.8, random_state = 0)

# Using GridSearchCV

In [25]:
# Baseline forest with default complexity; random_state is fixed for reproducibility.
rf = RandomForestRegressor(random_state = 10)
# Only one candidate is searched: range(60, 66, 6) expands to just [60],
# so the grid is written out explicitly.
parameters = {'n_estimators': [60]}

# 10-fold cross-validation scored with negative MSE (scikit-learn maximises
# scores, so the error is negated).
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a TypeError.
clf = GridSearchCV(rf, parameters, cv = 10,
                  scoring = "neg_mean_squared_error")

In [26]:
# Run the grid search on the training split and show the selected forest
# (fit returns the fitted search object, so the attribute can be read directly).
print(clf.fit(X_train, y_train).best_estimator_)
# Predictions for the held-out rows, used for the out-of-sample error below.
pred = clf.predict(X_test)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=60, n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False)


In [27]:
# clf.best_score_ is the mean cross-validated score of the best candidate on
# the training split (negative MSE here), so the sign is flipped to report MSE.
# NOTE(review): the label says "insample", but this is a cross-validation
# estimate rather than the error on the data the model was fitted to.
print("MSE insample with %s trees: " %clf.best_estimator_.n_estimators,
      round(-1*(clf.best_score_), 5))

# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print("MSE out of sample with %s trees: " %clf.best_estimator_.n_estimators,
      round(mean_squared_error(y_test, pred),5))

MSE insample with 60 trees:  16.94824
MSE out of sample with 60 trees:  25.84132


### Do you remember the error with the decision tree?
### Was it higher or lower?

# We can constrain the complexity of the random forest

In [9]:
# Search jointly over the number of trees and the maximum tree depth
# (11 x 9 = 99 candidates, each evaluated with 10-fold cross-validation).
rf = RandomForestRegressor(random_state = 10)
parameters = {'n_estimators':[2,5,10,15,20,30,50,70,80,100, 200],
             'max_depth':range(1,10)}

# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a TypeError.
clf = GridSearchCV(rf, parameters, cv = 10,
                  scoring = "neg_mean_squared_error")

# Fit on the training split, show the winning configuration and predict the
# held-out rows for the out-of-sample error.
clf.fit(X_train, y_train)
print(clf.best_estimator_)
pred = clf.predict(X_test)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False)


In [10]:
# Report the held-out MSE of the selected forest together with its size and depth.
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print("MSE out of sample with %s trees and max depth %s: " %(clf.best_estimator_.n_estimators,
                                                   clf.best_estimator_.max_depth),
      round(mean_squared_error(y_test, pred),5))

MSE out of sample with 20 trees and max depth 8:  28.1793


In [15]:
# Let's constrain the depth: every tree is limited to max_depth = 3 and only
# the number of trees is tuned.
rf = RandomForestRegressor(max_depth = 3, random_state = 10)
parameters = {'n_estimators':[2,5,10,15,20,30,50,70,80,100,200]}

# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a TypeError.
clf = GridSearchCV(rf, parameters, cv = 10,
                  scoring = "neg_mean_squared_error")

# Fit on the training split, show the winning configuration and predict the
# held-out rows for the out-of-sample error.
clf.fit(X_train, y_train)
print(clf.best_estimator_)
pred = clf.predict(X_test)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=10, verbose=0, warm_start=False)


In [16]:
# Report the held-out MSE. The depth is read from the fitted estimator instead
# of being hard-coded, so the message stays correct if max_depth is changed above.
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print("MSE out of sample with %s trees and max depth %s: " %(clf.best_estimator_.n_estimators,
                                                   clf.best_estimator_.max_depth),
      round(mean_squared_error(y_test, pred),5))

MSE out of sample with 50 trees and max depth 3:  27.95089
