In [7]:
import numpy as np
import scipy as sp
from sklearn.base import BaseEstimator
from sklearn.datasets import load_digits
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state

# Load the digits dataset bundled with scikit-learn; the cells below use
# digits.data as the feature matrix and digits.target as the class labels.
digits = load_digits()

In [9]:
# Baseline: a single plain decision tree, scored by mean accuracy over 10 CV folds
tree = DecisionTreeClassifier()
tree_fold_scores = cross_val_score(
    tree,
    digits.data,
    digits.target,
    scoring="accuracy",
    cv=10,
)
tree_fold_scores.mean()

0.8258069522036002

In [11]:
# Reference point: scikit-learn's own random forest with 100 trees,
# scored by mean accuracy over 10 CV folds
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf_fold_scores = cross_val_score(
    rf,
    digits.data,
    digits.target,
    scoring="accuracy",
    cv=10,
)
rf_fold_scores.mean()

0.949897579143389

In [12]:
# Your implementation of Random Forest
class MyRandomForest(BaseEstimator):
    """Random forest classifier built from bagged scikit-learn decision trees.

    Every tree is a ``DecisionTreeClassifier(max_features="sqrt")`` trained on
    a bootstrap sample (rows drawn with replacement) of the training data.
    Predictions are the majority vote over all trees; ties go to the smallest
    class label.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to fit.
    random_state : int, numpy.random.RandomState or None, default=None
        Source of randomness for the bootstrap samples and for the trees.
        ``None`` uses NumPy's global random state.

    Attributes
    ----------
    trees : list of DecisionTreeClassifier
        The fitted trees; empty until ``fit`` has been called.
    classes_ : ndarray
        Sorted unique class labels seen during ``fit``.
    """

    def __init__(self, n_estimators=100, random_state=None):
        self.n_estimators = n_estimators # number of trees to fit
        self.random_state = random_state # seed / generator for reproducible fits
        self.trees = []                  # list to store the fitted trees

    def fit(self, X, y):
        """Fit ``n_estimators`` trees, each on its own bootstrap sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Class labels (any dtype that ``numpy.unique`` can sort).

        Returns
        -------
        self : MyRandomForest
            The fitted estimator.

        Raises
        ------
        ValueError
            If ``n_estimators`` is smaller than 1, ``X`` is empty, or ``y``
            does not hold exactly one label per row of ``X``.
        """
        if self.n_estimators < 1:
            raise ValueError("n_estimators must be at least 1.")

        # Accept lists / DataFrames as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        n = len(X)
        if n == 0:
            raise ValueError("Cannot fit MyRandomForest on an empty data set.")
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        if y.ndim != 1 or y.shape[0] != n:
            raise ValueError(
                "y must be a 1-D array of class labels with one entry per row of X."
            )

        # Train on integer class indices so that voting works for any label
        # dtype (ints, strings, ...) and predictions keep the original dtype.
        self.classes_, y_encoded = np.unique(y, return_inverse=True)

        # With random_state=None this is NumPy's global random state.
        rng = check_random_state(self.random_state)

        # Build into a fresh list: refitting must replace the previous forest
        # instead of appending new trees behind the old ones.
        trees = []
        for _ in range(self.n_estimators):
            ix = rng.choice(n, n)  # bootstrap: n row indices drawn with replacement
            tree = DecisionTreeClassifier(max_features="sqrt", random_state=rng)
            tree.fit(X[ix], y_encoded[ix])
            trees.append(tree)
        self.trees = trees

        return self

    def predict(self, X):
        """Predict class labels by majority vote over the fitted trees.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted labels, taken from ``classes_``. When several classes
            receive the same number of votes, the smallest label wins.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if not self.trees:
            raise NotFittedError(
                "This MyRandomForest instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )

        X = np.asarray(X)
        n_samples = len(X)
        rows = np.arange(n_samples)

        # votes[i, k] = number of trees that predicted class index k for sample i.
        votes = np.zeros((n_samples, len(self.classes_)), dtype=np.intp)
        for tree in self.trees:
            class_idx = tree.predict(X).astype(np.intp, copy=False)
            votes[rows, class_idx] += 1

        # argmax returns the first maximum, i.e. the smallest class on ties.
        return self.classes_[votes.argmax(axis=1)]

In [13]:
# Majority-vote demo: each row is one data point, each column one tree's vote.
n_points, n_trees = 20, 5
predictions = np.random.choice(10, (n_points, n_trees))
print(predictions)
# The row-wise mode is the winning label for each data point.
row_modes = sp.stats.mode(predictions, axis = 1)
row_modes[0].ravel()

[[9 0 3 5 3]
 [1 1 1 6 8]
 [5 7 2 8 1]
 [6 4 6 4 5]
 [3 0 8 8 1]
 [5 0 6 2 7]
 [0 2 5 2 3]
 [7 4 4 5 6]
 [2 8 5 5 3]
 [2 4 2 5 5]
 [8 1 7 5 1]
 [9 3 9 4 4]
 [0 9 5 3 1]
 [5 9 3 8 5]
 [4 4 7 6 1]
 [0 7 0 6 4]
 [5 7 9 7 6]
 [4 1 1 0 6]
 [4 0 7 8 1]
 [1 8 9 5 6]]


array([3, 1, 1, 4, 8, 0, 2, 4, 5, 2, 1, 4, 0, 5, 4, 0, 7, 1, 0, 1])

In [14]:
# Evaluate MyRandomForest with the same protocol as the models above:
# mean accuracy over 10 CV folds
mrf = MyRandomForest()
mrf_fold_scores = cross_val_score(
    mrf,
    digits.data,
    digits.target,
    scoring="accuracy",
    cv=10,
)
mrf_fold_scores.mean()

0.9526939788950962