## Random Forest Regression

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [3]:
from sklearn import datasets ## imports datasets from scikit-learn

In [4]:
# Load the Boston housing dataset from scikit-learn's `datasets` module.
# NOTE(review): `load_boston` is reported as deprecated/removed in newer
# scikit-learn releases — confirm it exists in the version this notebook targets.
data = datasets.load_boston()

In [5]:
# Predictor frame: the raw feature array with one labelled column per feature name.
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [6]:
X.head(6)  # preview the first six rows of the predictor frame

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21


In [7]:
# Target frame: the housing value, stored as a single column named MEDV.
Y = pd.DataFrame({"MEDV": data.target})

In [8]:
Y.head(5)  # preview the target; MEDV is a continuous numerical variable

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [9]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=25,
)

### Random forest regression

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [11]:
# Shallow random forest: every tree is capped at depth 2 and random_state is fixed.
# n_estimators is pinned to 10 — the value in effect for this notebook's recorded
# run — so the results do not depend on which library default is installed.
regr = RandomForestRegressor(n_estimators=10, max_depth=2, random_state=0)

In [12]:
# Fit on the training split. `y_train` is a one-column DataFrame, which makes
# scikit-learn warn that a column vector was passed where a 1-D array was expected;
# flattening it to shape (n_samples,) fits the same model without the warning.
regr.fit(X_train, y_train.values.ravel())

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [13]:
# Importance score of each predictor, in the same order as the training columns.
print(regr.feature_importances_)

[ 0.          0.          0.          0.          0.00729892  0.70021159
  0.          0.          0.          0.          0.          0.
  0.29248949]


In [14]:
# Keep the fitted forest's per-predictor importance scores under their own name.
importances = regr.feature_importances_

In [15]:
# Rank predictors from most to least important and print one line per predictor.
# The loop walks the sorted index array itself rather than range(X.shape[1]), so
# its length always matches `importances` even if `X` has since been reassigned
# to a frame with a different number of columns.
indices = np.argsort(importances)[::-1]
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

1. feature 5 (0.700212)
2. feature 12 (0.292489)
3. feature 4 (0.007299)
4. feature 11 (0.000000)
5. feature 10 (0.000000)
6. feature 9 (0.000000)
7. feature 8 (0.000000)
8. feature 7 (0.000000)
9. feature 6 (0.000000)
10. feature 3 (0.000000)
11. feature 2 (0.000000)
12. feature 1 (0.000000)
13. feature 0 (0.000000)


In [16]:
X.head(5)  # preview the predictor frame again; column positions correspond to the feature indices

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [17]:
# Predict the target for the held-out test rows with the fitted forest.
Y_pred = regr.predict(X_test)

In [18]:
Y_pred  # display the full array of test-set predictions

array([ 15.36462224,  15.36462224,  23.01660873,  14.54563994,
        16.25055599,  23.01660873,  24.08692209,  14.54563994,
        14.54563994,  41.65039063,  14.54563994,  14.54563994,
        17.13054181,  23.01660873,  25.0774782 ,  23.01660873,
        23.01660873,  14.54563994,  14.54563994,  23.01660873,
        23.01660873,  14.54563994,  23.01660873,  31.98945675,
        23.01660873,  23.01660873,  45.0534172 ,  23.01660873,
        23.01660873,  23.01660873,  23.01660873,  24.08692209,
        23.01660873,  23.01660873,  14.54563994,  23.01660873,
        23.01660873,  23.01660873,  30.06063136,  23.01660873,
        23.01660873,  23.01660873,  14.54563994,  16.25055599,
        14.54563994,  23.01660873,  14.54563994,  23.01660873,
        31.98945675,  14.54563994,  14.54563994,  15.36462224,
        25.9094525 ,  23.01660873,  23.01660873,  14.54563994,
        31.98945675,  24.08692209,  14.54563994,  24.08692209,
        14.54563994,  22.1699973 ,  23.01660873,  23.01

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
# Mean squared error of the predictions against the held-out targets.
mean_squared_error(y_true=y_test, y_pred=Y_pred)

33.033671441782367

In [21]:
from sklearn.metrics import r2_score

In [22]:
# R^2 score of the predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=Y_pred)

0.5156731310597823

In [23]:
# Rebuild the predictor frame from the raw feature array, one labelled column per feature name.
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [24]:
X.head(3)  # preview the first three rows of the predictor frame

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03


In [25]:
# Restrict the predictors to two columns, RM and LSTAT.
X1 = X.loc[:, ["RM", "LSTAT"]]

In [26]:
# Rebuild the target frame: the housing value as a single column named MEDV.
Y = pd.DataFrame({"MEDV": data.target})

In [27]:
# Split the two-predictor frame, again holding out 20% of the rows with a fixed random_state.
X1_train, X1_test, y_train, y_test = train_test_split(
    X1,
    Y,
    test_size=0.2,
    random_state=25,
)

In [28]:
# Refit the same `regr` object on the two-predictor training split; this replaces
# the model previously fitted on X_train. `y_train` is a one-column DataFrame, so
# it is flattened to shape (n_samples,) to avoid scikit-learn's column-vector-y
# warning; the fitted model is unchanged by the flattening.
regr.fit(X1_train, y_train.values.ravel())

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [29]:
# Predict the target for the held-out rows of the two-predictor split.
Y1_pred = regr.predict(X1_test)

In [30]:
# Mean squared error of the two-predictor model on the held-out targets.
mean_squared_error(y_true=y_test, y_pred=Y1_pred)

33.28128491726266

In [31]:
# Importance scores of the refitted model, in the column order of X1 (RM, then LSTAT).
print(regr.feature_importances_)

[ 0.70093174  0.29906826]
