# Random Forest Regression From Scratch

In this notebook, we will implement the **Random Forest Regression** algorithm from scratch. We will adapt the logic from our `DecisionTreeRegressor` and extend it with bootstrapping and aggregation (averaging).

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor as SklearnRFR
from sklearn.model_selection import train_test_split

## 1. Decision Tree Regressor (Adapted for RF)

We need a `DecisionTreeRegressor` that supports optional feature subsampling during splits.

In [None]:
class Node:
    """A single node of a binary regression tree.

    Internal nodes carry a ``feature`` index, a ``threshold`` and two
    children (``left`` / ``right``); leaves carry only a prediction in
    ``value``. Every field defaults to ``None`` so that either kind of node
    can be built with keyword arguments alone.
    """

    def __init__(self, feature=None, threshold=None, value=None, left=None, right=None):
        # Split description (internal nodes) and stored prediction (leaves).
        self.feature, self.threshold, self.value = feature, threshold, value
        # Child subtrees.
        self.left, self.right = left, right

class DecisionTreeRegressor:
    """Regression tree grown greedily by minimising weighted child MSE.

    Internal nodes route a sample left when ``x[feature] <= threshold`` and
    right otherwise; each leaf stores the mean target of the training samples
    that reached it.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree. ``None`` means no limit; ``0`` produces a
        single leaf that predicts the mean of ``y``.
    min_samples_split : int
        A node holding fewer samples than this is turned into a leaf.
    feature_subset_size : int or None
        Number of features drawn at random (without replacement) as split
        candidates at every node. ``None`` or ``0`` means all features are
        considered.
    """

    def __init__(self, max_depth=None, min_samples_split=2, feature_subset_size=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.feature_subset_size = feature_subset_size
        self.root = None

    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and targets ``y``.

        Array-likes (lists, tuples, ...) are converted with ``np.asarray``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its length differs from ``y``.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got {X.ndim}-D input")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit a tree on zero samples")
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.root = self._build_tree(X, y, depth=0)
        return self

    def _mse(self, y):
        """Return the mean squared deviation of ``y`` from its own mean (0 if empty)."""
        if len(y) == 0:
            return 0
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y):
        """Find the (feature index, threshold) pair with the lowest weighted child MSE.

        Returns ``(None, None)`` when no candidate puts at least one sample on
        each side (e.g. every candidate feature is constant).
        """
        lowest_mse = float('inf')
        split_idx, split_thresh = None, None

        n_samples, n_features = X.shape

        # Feature subsampling for Random Forest: a fresh subset per node.
        if self.feature_subset_size:
            n_candidates = min(self.feature_subset_size, n_features)
            f_indices = random.sample(range(n_features), n_candidates)
        else:
            f_indices = range(n_features)

        for f_idx in f_indices:
            column = X[:, f_idx]  # hoisted: reused for every threshold
            for thresh in np.unique(column):
                left_mask = column <= thresh
                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left
                # A split that leaves one side empty is not a split at all.
                if n_left == 0 or n_right == 0:
                    continue

                y_left, y_right = y[left_mask], y[~left_mask]
                curr_mse = (n_left / n_samples) * self._mse(y_left) \
                    + (n_right / n_samples) * self._mse(y_right)

                # Strict '<' keeps the first candidate among ties.
                if curr_mse < lowest_mse:
                    lowest_mse = curr_mse
                    split_idx = f_idx
                    split_thresh = thresh

        return split_idx, split_thresh

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
        n_samples = X.shape[0]

        # Compare against None explicitly: a truthiness test would treat
        # max_depth=0 as "no limit" instead of "root is a leaf".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or n_samples < self.min_samples_split or len(np.unique(y)) == 1:
            return Node(value=np.mean(y))

        idx, thresh = self._best_split(X, y)
        if idx is None:
            return Node(value=np.mean(y))

        left_mask = X[:, idx] <= thresh
        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[~left_mask], y[~left_mask], depth + 1)
        return Node(feature=idx, threshold=thresh, left=left, right=right)

    def _predict_one(self, x, node):
        """Route a single sample ``x`` down from ``node`` and return the leaf value."""
        # Leaves are the nodes that carry a value; walk down until one is hit.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Return a 1-D array with one prediction per row of ``X``.

        Raises
        ------
        RuntimeError
            If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("DecisionTreeRegressor is not fitted; call fit() before predict().")
        X = np.asarray(X)
        return np.array([self._predict_one(x, self.root) for x in X])

## 2. Random Forest Regressor Implementation

In [None]:
class RandomForestRegressor:
    """Ensemble of regression trees trained on bootstrap samples and averaged.

    Parameters
    ----------
    n_trees : int
        Number of trees to grow.
    max_depth : int or None
        Depth limit forwarded to every tree.
    min_samples_split : int
        Minimum node size for a split, forwarded to every tree.
    feature_subset_size : int or None
        Number of features each tree considers per split, forwarded to every
        tree.
    """

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, feature_subset_size=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.feature_subset_size = feature_subset_size
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap resample of ``(X, y)``.

        Array-likes are converted with ``np.asarray``. Any previously fitted
        trees are discarded, but only once the new forest is complete.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its length differs from ``y``.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got {X.ndim}-D input")
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit a forest on zero samples")
        if y.ndim == 0 or y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        # Build into a local list so a failure midway does not leave a
        # half-trained forest on the instance.
        trees = []
        for _ in range(self.n_trees):
            # Bootstrapping: draw n_samples row indices with replacement.
            indices = np.random.choice(n_samples, n_samples, replace=True)

            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                feature_subset_size=self.feature_subset_size,
            )
            tree.fit(X[indices], y[indices])
            trees.append(tree)

        self.trees = trees
        return self

    def predict(self, X):
        """Return the per-sample mean of all trees' predictions.

        Raises
        ------
        RuntimeError
            If the forest holds no trees (``fit`` was not called, or
            ``n_trees`` was 0). Averaging an empty set would otherwise yield
            NaN with only a warning.
        """
        if not self.trees:
            raise RuntimeError("RandomForestRegressor has no trees; call fit() before predict().")

        X = np.asarray(X)
        # Shape (n_trees, n_samples); averaging over axis 0 aggregates the trees.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

## 3. Testing and Comparison

In [None]:
# Seed both RNGs the from-scratch forest draws from: the stdlib `random`
# module (feature subsampling) and NumPy's global generator (bootstrapping).
# Without this, the reported MSE changes on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

X, y = make_regression(n_samples=100, n_features=1, noise=10, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Our implementation.
rf = RandomForestRegressor(n_trees=20, max_depth=5, feature_subset_size=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

mse = np.mean((y_test - y_pred)**2)
print(f"Our RF MSE: {mse:.4f}")

# Reference: scikit-learn's forest (imported as SklearnRFR in the imports
# cell) with the same number of trees and the same depth limit.
sk_rf = SklearnRFR(n_estimators=20, max_depth=5, random_state=SEED)
sk_rf.fit(X_train, y_train)
sk_pred = sk_rf.predict(X_test)
sk_mse = np.mean((y_test - sk_pred)**2)
print(f"Sklearn RF MSE: {sk_mse:.4f}")

## 4. Visualizing the Predicted Curve

In [None]:
# Evaluate the fitted forest on an evenly spaced grid spanning the observed
# feature range so the piecewise-constant prediction can be drawn as a line.
X_grid = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
y_grid_pred = rf.predict(X_grid)

fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', label='Actual Data')
ax.plot(X_grid, y_grid_pred, color='red', label='RF Prediction')
ax.set_title("Random Forest Regression")
# Label the axes so the figure stands on its own when the notebook is skimmed.
ax.set_xlabel("Feature value")
ax.set_ylabel("Target")
ax.legend()
plt.show()