# Application of Bootstrap samples in Random Forest

In [1]:
import numpy as np
import pandas as pd
import random 
from tqdm import tqdm
from statistics import mean
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

 <li> Load the boston house dataset </li>

In [2]:
boston = load_boston()
x=boston.data #independent variables
y=boston.target #target variable
df = pd.DataFrame(x)
df['y'] = y
df,df.shape

(           0     1      2    3      4      5     6       7    8      9    10  \
 0    0.00632  18.0   2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3   
 1    0.02731   0.0   7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8   
 2    0.02729   0.0   7.07  0.0  0.469  7.185  61.1  4.9671  2.0  242.0  17.8   
 3    0.03237   0.0   2.18  0.0  0.458  6.998  45.8  6.0622  3.0  222.0  18.7   
 4    0.06905   0.0   2.18  0.0  0.458  7.147  54.2  6.0622  3.0  222.0  18.7   
 ..       ...   ...    ...  ...    ...    ...   ...     ...  ...    ...   ...   
 501  0.06263   0.0  11.93  0.0  0.573  6.593  69.1  2.4786  1.0  273.0  21.0   
 502  0.04527   0.0  11.93  0.0  0.573  6.120  76.7  2.2875  1.0  273.0  21.0   
 503  0.06076   0.0  11.93  0.0  0.573  6.976  91.0  2.1675  1.0  273.0  21.0   
 504  0.10959   0.0  11.93  0.0  0.573  6.794  89.3  2.3889  1.0  273.0  21.0   
 505  0.04741   0.0  11.93  0.0  0.573  6.030  80.8  2.5050  1.0  273.0  21.0   
 
          11    12     y  

### Task: 1
<font color='red'><b>Step 1 Creating samples: </b></font> Randomly create 30 samples from the whole boston data points.
<ol>
<li>Creating each sample: Consider any random 303(60% of 506) data points from whole data set and then replicate any 203 points from the sampled points</li>
<li>Ex: For better understanding of this procedure lets check this examples, assume we have 10 data points [1,2,3,4,5,6,7,8,9,10], first we take 6 data points randomly consider we have selected [4, 5, 7, 8, 9, 3] now we will replciate 4 points from [4, 5, 7, 8, 9, 3], consder they are [5, 8, 3,7] so our final sample will be [4, 5, 7, 8, 9, 3, 5, 8, 3,7]</li>
<li> we create 30 samples like this </li>
<li> Note that as a part of the Bagging when you are taking the random samples make sure each of the sample will have                different set of columns</li>
<li> Ex: assume we have 10 columns for the first sample we will select [3, 4, 5, 9, 1, 2] and for the second sample [7, 9, 1, 4, 5, 6, 2] and so on...</li>
<li> Make sure each sample will have atleast 3 feautres/columns/attributes</li>
</ol>

<font color='red'><b>Step 2 Building High Variance Models on each of the sample and finding train MSE value:</b></font> Build a DecisionTreeRegressor on each of the sample.
<ol><li>Build a regression trees on each of 30 samples.</li>
<li>computed the predicted values of each data point(506 data points) in your corpus.</li>
<li> predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{30}\sum_{k=1}^{30}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $MSE =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})$.</li>
</ol>

<font color='red'><b>Step 3 Calculating the OOB score :</b></font>
<ol>
<li>Computed the predicted values of each data point(506 data points) in your corpus.</li>
<li>Predicted house price of $i^{th}$ data point $y^{i}_{pred} =  \frac{1}{k}\sum_{\text{k= model which was buit on samples not included } x^{i}}(\text{predicted value of } x^{i} \text{ with } k^{th} \text{ model})$.</li>
<li>Now calculate the $OOB Score =  \frac{1}{506}\sum_{i=1}^{506}(y^{i} - y^{i}_{pred})$.</li>
</ol>

### Task: 2
<pre>
<font color='red'><b>Computing CI of OOB Score and Train MSE</b></font>
<ol>
<li> Repeat Task 1 for 35 times, and for each iteration store the Train MSE and OOB score </li>
<li> After this we will have 35 Train MSE values and 35 OOB scores </li>
<li> using these 35 values (assume like a sample) find the confidence intravels of MSE and OOB Score </li>
<li> you need to report CI of MSE and CI of OOB Score </li>
<li> Note: Refer the Central_Limit_theorem.ipynb to check how to find the confidence intravel</li>
</ol>
</pre>
### Task: 3
<pre>
<font color='red'><b>Given a single query point predict the price of house.</b></font>

<li>Consider xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60] Predict the house price for this point as mentioned in the step 2 of Task 1. </li>
</pre>

In [3]:
tr_x,t_x,tr_y,t_y = train_test_split(df.iloc[:,:-1],df.iloc[:,-1],train_size=303)

In [4]:
tr_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
126,0.38735,0.0,25.65,0.0,0.5810,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26
132,0.59005,0.0,21.89,0.0,0.6240,6.372,97.9,2.3274,4.0,437.0,21.2,385.76,11.12
349,0.02899,40.0,1.25,0.0,0.4290,6.939,34.5,8.7921,1.0,335.0,19.7,389.85,5.89
278,0.07978,40.0,6.41,0.0,0.4470,6.482,32.1,4.1403,4.0,254.0,17.6,396.90,7.19
451,5.44114,0.0,18.10,0.0,0.7130,6.655,98.2,2.3552,24.0,666.0,20.2,355.29,17.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.00632,18.0,2.31,0.0,0.5380,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
500,0.22438,0.0,9.69,0.0,0.5850,6.027,79.7,2.4982,6.0,391.0,19.2,396.90,14.33
203,0.03510,95.0,2.68,0.0,0.4161,7.853,33.2,5.1180,4.0,224.0,14.7,392.78,3.81
277,0.06127,40.0,6.41,1.0,0.4470,6.826,27.6,4.8628,4.0,254.0,17.6,393.45,4.16


In [5]:
mse = []
oob =[]
for x in range(1):
    a_clf={}
    index = dict()
    pred_plus = np.zeros(506)
    for i in tqdm(range(1,31)):
        c = random.randint(4,10)
        df2 = df.iloc[:,:-1].sample(n=c,axis=1)
        df2 = df2.reindex(sorted(df2.columns), axis=1)
        df2['y'] = df['y']
        df3 = df2.sample(n=303).sort_index()
        index[i] ={
            'row':list(df3.index),
            'col':list(df2.columns)
        }
        df3 = pd.concat([df3,df3.sample(n=203)]).sort_index()
        # gs_clf = GridSearchCV(estimator = DecisionTreeRegressor(),param_grid ={'max_depth':[5,10,15,20,30],'min_samples_split':[10,20,30,40,50]})
        clf= DecisionTreeRegressor(max_depth=100,min_samples_split=3) ## Taking high variance models as we are here to imitate the Bagging-RandomForest, which reduces the variance of the models.
        # gs_clf.fit(df3.iloc[:,:-1],df3.iloc[:,-1])
        clf.fit(df3.iloc[:,:-1],df3.iloc[:,-1])
        # clf = gs_clf.best_estimator_
        a_clf[i] = clf
        pred_plus = pred_plus + clf.predict(df2.iloc[:,:-1])
    MSe = sum((pred_plus/30-df2.iloc[:,-1])**2)/506

    # OOB scor calculation
    oob_pred =[]
    pl =[]
    for ind,dat in tqdm(df.iterrows()):
        pred_val = []
        for i in range(1,31):
            if ind not in index[i]['row']:
                val = a_clf[i].predict(dat[index[i]['col']].iloc[:-1].to_numpy().reshape(1,-1))
                pred_val.append(val[0])
        pl.append(len(pred_val))
        oob_pred.append(mean(pred_val))
    OOb = sum(oob_pred-df['y'])/506

    mse.append(MSe)
    oob.append(OOb)
mse,oob

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 72.20it/s]
506it [00:05, 98.51it/s] 


([2.1838841440186907], [0.007076970434703175])

In [6]:
mse,oob

([2.1838841440186907], [0.007076970434703175])

In [7]:
xq= [0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60]
x_q = pd.Series(xq)
prediction = []
for i in range(1,31):
    prediction.append(a_clf[i].predict(x_q.iloc[index[i]['col'][:-1]].to_numpy().reshape(1,-1))[0])
mean(prediction)

19.811666666666667