In [5]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import random, math
from sklearn.tree import DecisionTreeClassifier
from scipy import stats

# Load the Iris dataset: feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42
)

In [13]:
class RandomForest:
    """Ensemble of shallow decision trees combined by majority vote.

    Each of the ``B`` trees is trained on a random subset of the training
    rows drawn *without* replacement, so a row appears at most once per
    tree. The rows a tree did not receive form its out-of-bag (OOB) set,
    which is used to estimate that tree's accuracy.

    Parameters
    ----------
    B : int
        Number of trees in the ensemble (at least 1).
    bootstrap_ratio : float
        Fraction of the training rows handed to each tree. It must give at
        least one and at most ``len(X_train)`` rows; a value of 1 leaves no
        out-of-bag rows to score.
    random_state : int or None, default=None
        Seed for the row sampling and for the trees. With ``None`` the rows
        are drawn from the global ``random`` module and the trees are left
        unseeded.

    Attributes
    ----------
    models : list of DecisionTreeClassifier
        The individual trees.
    oob_scores_ : list of float
        Per-tree out-of-bag accuracy, set by ``fit`` (``nan`` for a tree
        that has no out-of-bag rows).
    oob_score_ : float
        Mean of the available per-tree out-of-bag accuracies, set by ``fit``.
    """

    def __init__(self, B, bootstrap_ratio, random_state=None):
        if B < 1:
            raise ValueError(f"B must be at least 1, got {B}")
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.random_state = random_state
        self.tree_params = {'max_depth': 2, 'max_features': 'sqrt'}
        # Give every tree its own seed so a seeded forest is reproducible
        # while the trees still differ from one another.
        self.models = [
            DecisionTreeClassifier(
                **self.tree_params,
                random_state=None if random_state is None else random_state + i,
            )
            for i in range(B)
        ]

    def fit(self, X_train, y_train):
        """Train every tree on its own random subset of the training rows.

        Prints the out-of-bag accuracy of each tree and their average, and
        stores them in ``oob_scores_`` and ``oob_score_``.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
        y_train : array-like of shape (n_samples,)

        Returns
        -------
        self : RandomForest

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` differ in length, or if
            ``bootstrap_ratio`` does not yield between 1 and ``n_samples``
            rows per tree.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        m = X_train.shape[0]
        if y_train.shape[0] != m:
            raise ValueError(
                f"X_train has {m} rows but y_train has {y_train.shape[0]}"
            )

        sample_size = int(self.bootstrap_ratio * m)
        # Rows are drawn without replacement, so a tree can never receive
        # more than m of them; an empty sample cannot be fitted either.
        if not 1 <= sample_size <= m:
            raise ValueError(
                f"bootstrap_ratio={self.bootstrap_ratio} gives {sample_size} "
                f"rows per tree; expected between 1 and {m}"
            )

        # Without a seed keep using the global `random` module, so a caller's
        # own random.seed(...) still governs the draw.
        rng = random if self.random_state is None else random.Random(self.random_state)

        self.oob_scores_ = []
        print("Out of bag score for each tree")
        for i, model in enumerate(self.models):
            # Distinct row indices for this tree (sampling without replacement).
            in_bag = np.array(rng.sample(range(m), sample_size))
            oob_mask = np.ones(m, dtype=bool)
            oob_mask[in_bag] = False

            # Indexing the original arrays keeps the label dtype intact.
            model.fit(X_train[in_bag], y_train[in_bag])

            if oob_mask.any():
                score = accuracy_score(
                    y_train[oob_mask], model.predict(X_train[oob_mask])
                )
            else:
                # Every row was in-bag, so there is nothing to score.
                score = float('nan')
            self.oob_scores_.append(score)
            print(f"Tree {i}", score)

        scored = [s for s in self.oob_scores_ if not np.isnan(s)]
        self.oob_score_ = sum(scored) / len(scored) if scored else float('nan')
        print("Average out of bag score")
        print(self.oob_score_)
        return self

    def predict(self, X_test):
        """Predict a label for every row by majority vote over the trees.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            The most frequent label per row, in the dtype the trees predict.
            Ties go to the smallest label.
        """
        X_test = np.asarray(X_test)
        # Row i holds tree i's label for every sample: shape (B, n_samples).
        votes = np.stack([np.asarray(model.predict(X_test)) for model in self.models])
        classes = np.unique(votes)
        if classes.size == 0:
            # No samples were passed in; return an empty prediction array.
            return votes[0]
        # counts[k, j] = number of trees voting classes[k] for sample j.
        # Counting with NumPy avoids scipy.stats.mode, whose result shape
        # changed between SciPy releases.
        counts = np.stack([(votes == label).sum(axis=0) for label in classes])
        return classes[counts.argmax(axis=0)]

In [14]:
# Train a five-tree ensemble, giving each tree 80% of the training rows.
model = RandomForest(B=5, bootstrap_ratio=0.8)
model.fit(X_train, y_train)

# Evaluate the ensemble's predictions on the held-out test split.
yhat = model.predict(X_test)
report = classification_report(y_test, yhat)
print(report)

Out of bag score for each tree
Tree 0 0.7619047619047619
Tree 1 0.9523809523809523
Tree 2 0.9523809523809523
Tree 3 0.9523809523809523
Tree 4 0.9047619047619048
Average out of bag score
0.9047619047619048
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45

