# Random Forest From Scratch

In this notebook, we will implement the **Random Forest** algorithm from scratch. We will reuse the `DecisionTree` implementation from our previous work and extend it with bootstrapping and aggregation.

In [1]:
import numpy as np
from collections import Counter
import math
import random

## 1. Decision Tree Implementation (Reused)

First, we include the necessary components for a single Decision Tree: `entropy`, `information_gain`, `Node`, and `DecisionTree`.

In [2]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the label collection ``y``.

    Every distinct label contributes ``-p * log2(p)``, where ``p`` is the
    fraction of entries in ``y`` that carry that label. When ``y`` is empty
    there are no labels to sum over, so the result is 0.
    """
    n_labels = len(y)
    # Relative frequency of each distinct label (evaluated lazily, so an
    # empty input never divides by zero).
    probabilities = (freq / n_labels for freq in Counter(y).values())
    return sum(-p * math.log2(p) for p in probabilities)

def information_gain(X, y, feature_index):
    """Return the information gain of splitting on one categorical feature.

    The gain is the entropy of ``y`` minus the size-weighted average entropy
    of the label groups obtained by partitioning the rows on the value found
    at ``feature_index``.

    Args:
        X: Sequence of rows; each row is indexable by ``feature_index``.
        y: Sequence of class labels, one per row of ``X``.
        feature_index: Index of the feature to evaluate.

    Returns:
        The information gain in bits. Empty input yields 0.0.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """
    n_samples = len(y)
    if len(X) != n_samples:
        raise ValueError(
            f"X and y must have the same length (got {len(X)} and {n_samples})."
        )
    if n_samples == 0:
        # Nothing to split, hence nothing to gain.
        return 0.0

    # Group the labels by feature value in a single pass instead of
    # re-scanning every row once per distinct value. A dict keeps first-seen
    # order, so the summation order below does not depend on hash ordering.
    labels_by_value = {}
    for row, label in zip(X, y):
        labels_by_value.setdefault(row[feature_index], []).append(label)

    weighted_entropy = sum(
        (len(group) / n_samples) * entropy(group)
        for group in labels_by_value.values()
    )
    return entropy(y) - weighted_entropy

class Node:
    """One node of a categorical decision tree.

    A node built with ``label`` acts as a leaf carrying that class label.
    A node built with ``feature`` acts as an internal node; its ``children``
    mapping is filled in by the tree builder, keyed by feature value.
    """

    def __init__(self, feature=None, value=None, label=None):
        # feature: index of the feature tested at this node
        # value:   feature value associated with the split (categorical here)
        # label:   class label (set on leaf nodes)
        self.feature, self.value, self.label = feature, value, label
        # Fresh mapping per instance, so nodes never share children.
        self.children = dict()

class DecisionTree:
    """ID3-style decision tree classifier for categorical features.

    Every internal node splits on the candidate feature with the highest
    information gain and creates one child per feature value observed in the
    training rows that reach it. When ``feature_subset_size`` is set, only a
    random subset of the features is considered at each split (the Random
    Forest modification).
    """

    # Gains at or below this threshold are treated as "no information".
    # Comparing a float gain with exactly 0 lets rounding noise (~1e-16)
    # trigger splits that carry no information at all.
    _MIN_GAIN = 1e-12

    def __init__(self, max_depth=None, feature_subset_size=None):
        """Configure the tree.

        Args:
            max_depth: Maximum depth of the tree; ``None`` means unlimited.
            feature_subset_size: Number of randomly chosen features examined
                at each split; a falsy value (``None`` or 0) means all
                features are examined.
        """
        self.max_depth = max_depth
        self.feature_subset_size = feature_subset_size  # For Random Forest
        self.root = None

    def fit(self, X, y):
        """Build the tree from training rows ``X`` and their labels ``y``.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset.")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length (got {len(X)} and {len(y)})."
            )
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Recursively build and return the subtree for rows ``X`` / labels ``y``."""
        # 1. Base cases
        if len(set(y)) == 1:
            return Node(label=y[0])

        majority_label = Counter(y).most_common(1)[0][0]
        if self.max_depth is not None and depth >= self.max_depth:
            return Node(label=majority_label)

        # 2. Feature selection (Random Forest modification)
        n_features = len(X[0])
        if self.feature_subset_size:
            candidate_features = random.sample(
                range(n_features), min(self.feature_subset_size, n_features)
            )
        else:
            candidate_features = range(n_features)

        # 3. Find the best feature among the selected subset
        best_feature = None
        best_gain = -1
        for feature_index in candidate_features:
            gain = information_gain(X, y, feature_index)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_index

        # No candidate at all (rows without features) or no candidate that
        # separates the labels: stop with a majority leaf.
        if best_feature is None or best_gain <= self._MIN_GAIN:
            return Node(label=majority_label)

        node = Node(feature=best_feature)
        # Remembered so prediction can still answer when a row carries a
        # feature value that never reached this node during training.
        node.fallback_label = majority_label

        # 4. Partition the rows by the chosen feature's value in one pass.
        partitions = {}
        for row, label in zip(X, y):
            sub_X, sub_y = partitions.setdefault(row[best_feature], ([], []))
            sub_X.append(row)
            sub_y.append(label)

        for value, (sub_X, sub_y) in partitions.items():
            node.children[value] = self._build_tree(sub_X, sub_y, depth + 1)

        return node

    def predict_one(self, x, node):
        """Return the predicted label for row ``x``, starting at ``node``.

        The row is routed down the tree by its feature values. If it carries
        a value that has no branch at some internal node, the majority
        training label recorded at that node is returned instead of giving
        up. ``None`` is returned only when the node has no recorded fallback
        (for example a tree assembled by hand).
        """
        while node.label is None:
            child = node.children.get(x[node.feature])
            if child is None:
                return getattr(node, "fallback_label", None)
            node = child
        return node.label

    def predict(self, X):
        """Return the list of predicted labels, one per row of ``X``."""
        return [self.predict_one(x, self.root) for x in X]

## 2. Random Forest Implementation

Key components:
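1.  **Bootstrapping**: Train each tree on a bootstrap sample — rows drawn at random from the training data *with replacement*, with as many draws as there are training rows.
2.  **Feature Subsampling**: At each split, consider only a random subset of features (implemented inside `DecisionTree` above via `feature_subset_size`).
3.  **Aggregation**: Each tree predicts a class for a sample, and the forest returns the class with the most votes (majority vote).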

In [3]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample of the training data and, when
    ``feature_subset_size`` is set, examines only a random subset of features
    at every split. Predictions are the majority vote of the trees.
    """

    def __init__(self, n_trees=10, max_depth=None, feature_subset_size=None,
                 random_state=None):
        """Configure the forest.

        Args:
            n_trees: Number of trees to train.
            max_depth: Depth limit passed to every tree (``None`` = no limit).
            feature_subset_size: Number of features each tree examines per
                split, passed to every tree.
            random_state: Optional seed. When given, ``fit`` seeds the
                ``random`` module with it for the duration of training and
                restores the previous generator state afterwards, so repeated
                fits on the same data build the same forest.
        """
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.feature_subset_size = feature_subset_size
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot fit a RandomForest on an empty dataset.")
        if len(y) != n_samples:
            raise ValueError(
                f"X and y must have the same length (got {n_samples} and {len(y)})."
            )

        self.trees = []
        if self.random_state is None:
            self._grow_trees(X, y, n_samples)
            return

        # Seed the shared generator (the trees draw from the `random` module
        # too), then put the caller's generator state back when done.
        saved_state = random.getstate()
        random.seed(self.random_state)
        try:
            self._grow_trees(X, y, n_samples)
        finally:
            random.setstate(saved_state)

    def _grow_trees(self, X, y, n_samples):
        """Train the trees and append them to ``self.trees``."""
        for _ in range(self.n_trees):
            # Bootstrapping: n_samples row indices drawn with replacement.
            indices = [random.randint(0, n_samples - 1) for _ in range(n_samples)]
            sample_X = [X[i] for i in indices]
            sample_y = [y[i] for i in indices]

            tree_model = DecisionTree(
                max_depth=self.max_depth,
                feature_subset_size=self.feature_subset_size
            )
            tree_model.fit(sample_X, sample_y)
            self.trees.append(tree_model)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        A tree that answers ``None`` for a row abstains from that row's vote;
        if every tree abstains, the prediction for that row is ``None``.

        Raises:
            RuntimeError: If the forest has no trained trees.
        """
        if not self.trees:
            raise RuntimeError(
                "RandomForest has no trained trees; call fit() before predict()."
            )

        # Iterate over the trees that were actually trained rather than
        # trusting n_trees, which may have been changed since fit().
        tree_predictions = [tree_model.predict(X) for tree_model in self.trees]

        final_predictions = []
        # zip(*...) transposes per-tree lists into per-sample vote tuples.
        for sample_votes in zip(*tree_predictions):
            valid_votes = [vote for vote in sample_votes if vote is not None]
            if valid_votes:
                # Majority vote
                final_predictions.append(Counter(valid_votes).most_common(1)[0][0])
            else:
                final_predictions.append(None)

        return final_predictions

## 3. Testing Random Forest

In [4]:
# Example dataset (Weather Data)
# Columns: outlook, temperature, humidity, wind.
X_train = [
    ['Sunny', 'Hot', 'High', 'Weak'],
    ['Sunny', 'Hot', 'High', 'Strong'],
    ['Overcast', 'Hot', 'High', 'Weak'],
    ['Rain', 'Mild', 'High', 'Weak'],
    ['Rain', 'Cool', 'Normal', 'Weak'],
    ['Rain', 'Cool', 'Normal', 'Strong'],
    ['Overcast', 'Cool', 'Normal', 'Strong'],
    ['Sunny', 'Mild', 'High', 'Weak'],
    ['Sunny', 'Cool', 'Normal', 'Weak'],
    ['Rain', 'Mild', 'Normal', 'Weak'],
    ['Sunny', 'Mild', 'Normal', 'Strong'],
    ['Overcast', 'Mild', 'High', 'Strong'],
    ['Overcast', 'Hot', 'Normal', 'Weak'],
    ['Rain', 'Mild', 'High', 'Strong']
]

y_train = ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']

# Every training row needs exactly one label.
assert len(X_train) == len(y_train), "X_train and y_train are misaligned"

# Test Data (both rows also appear in the training set with these labels)
X_test = [
    ['Sunny', 'Cool', 'Normal', 'Weak'], # Should be Yes
    ['Rain', 'Mild', 'High', 'Strong']   # Should be No
]

# Bootstrapping and feature subsampling both draw from the `random` module;
# seed it so Restart & Run All trains the same forest every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Initialize Random Forest
rf = RandomForest(n_trees=5, max_depth=5, feature_subset_size=2)

# Train
rf.fit(X_train, y_train)

# Predict
predictions = rf.predict(X_test)
print("Predictions:", predictions)

Predictions: ['Yes', 'No']


### Compare with Single Decision Tree

In [5]:
# Baseline for comparison: one tree with the same depth limit, trained on the
# full training set. No feature_subset_size is passed, so every split
# considers all features.
dt = DecisionTree(max_depth=5)
dt.fit(X_train, y_train)
# Predict the same held-out rows that were given to the forest.
dt_preds = dt.predict(X_test)
print("Single Tree Predictions:", dt_preds)

Single Tree Predictions: ['Yes', 'No']
