## Random forest

### Implementation

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.tree import DecisionTreeClassifier

### Bagging

In [2]:
def bag(X, y, rng=None):
    """Draw one bootstrap sample (rows drawn with replacement) from ``X`` and ``y``.

    Parameters
    ----------
    X : array-like
        Feature data; the first axis indexes samples.
    y : array-like
        Labels, one per row of ``X``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Object whose ``choice`` method is used to draw the row indices.
        When omitted, the module-level ``np.random.choice`` is used, so
        ``np.random.seed`` controls the draw.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_sample, y_sample)`` with as many rows as the input; every
        sampled row keeps its own label.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    # Convert up front: with ``[]`` a pandas DataFrame selects columns and a
    # Series looks up index labels, so positional row sampling would fail or
    # silently pick the wrong rows.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n_samples} and {y.shape[0]}"
        )

    # Generate random row indices, as many as there are rows, with replacement.
    sampler = np.random if rng is None else rng
    indices = sampler.choice(n_samples, size=n_samples, replace=True)

    return X[indices], y[indices]

In [3]:
def most_common_label(y):
    """Return the value that occurs most often in ``y``.

    ``y`` is any iterable of hashable labels. When several labels share the
    highest count, the one ``Counter.most_common`` lists first is returned.
    """
    # most_common(1) yields a one-element list of (label, count) pairs.
    top_label, _votes = Counter(y).most_common(1)[0]
    return top_label

### Random forest class

In [4]:
class RandomForest:
    """Bagged ensemble of ``DecisionTreeClassifier`` models combined by majority vote.

    Parameters
    ----------
    n_trees : int, default=10
        Number of trees trained by ``fit``.
    min_samples_split : default=2
    max_depth : default=100
    max_features : default=None
        These three values are forwarded unchanged to every
        ``DecisionTreeClassifier``.

    Attributes
    ----------
    trees : list
        The trees produced by the most recent ``fit`` call (empty before
        the first fit).
    """

    def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, max_features=None):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``(X, y)``.

        Any trees from an earlier ``fit`` call are discarded. Returns ``self``
        so calls can be chained.
        """
        # Hand plain arrays to bag(), which picks rows by positional index.
        X = np.asarray(X)
        y = np.asarray(y)

        # Build into a fresh list: appending to self.trees would make a second
        # fit() grow the forest and keep trees trained on the previous data.
        trees = []
        for _ in range(self.n_trees):
            tree = DecisionTreeClassifier(
                    min_samples_split = self.min_samples_split,
                    max_depth = self.max_depth,
                    max_features= self.max_features)

            X_sample, y_sample = bag(X, y)
            tree.fit(X_sample, y_sample)
            trees.append(tree)

        self.trees = trees
        return self

    def predict(self, X):
        """Return, for each row of ``X``, the label predicted by most trees.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been called, or
            ``n_trees`` was 0).
        """
        if not self.trees:
            raise ValueError("RandomForest has no fitted trees; call fit() first.")

        X = np.asarray(X)

        # One row of predictions per tree ...
        tree_predict = np.array([tree.predict(X) for tree in self.trees])
        # ... swapped so that each row holds all the votes for one sample.
        tree_predict = np.swapaxes(tree_predict, 0, 1)
        y_pred = [most_common_label(tree_pred) for tree_pred in tree_predict]
        return np.array(y_pred)

## Demonstration on breast cancer dataset

In [5]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def accuracy(y_true, y_pred):
    """Return the fraction of entries in ``y_pred`` that equal ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; both must have the same shape.

    Returns
    -------
    numpy floating scalar
        Share of matching entries, between 0 and 1.

    Raises
    ------
    ValueError
        If the two inputs have different shapes.
    """
    # Plain lists compare as a single bool with ``==``; convert so the
    # comparison is element-wise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Reject mismatches explicitly: e.g. (n,) against (n, 1) would broadcast
    # to an n x n comparison and yield a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    return np.mean(y_true == y_pred)

# Seed NumPy's global random state before anything stochastic runs: bag()
# draws its bootstrap indices with np.random.choice and the trees are created
# without a random_state, so without a seed the reported accuracy changes
# from run to run.
SEED = 1
np.random.seed(SEED)

X, y = load_breast_cancer(as_frame=True, return_X_y = True)

# The forest works on NumPy arrays (rows are picked by positional index).
X = X.to_numpy()
y = y.to_numpy()

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

clf = RandomForest(n_trees=10, max_depth=10)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)

print("Accuracy: ", acc)

Accuracy:  0.9736842105263158


In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Text summary of the held-out predictions produced by classification_report.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        42
           1       0.96      1.00      0.98        72

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114

