# Lecture 04 Random Forests

Below is a simplified example of how to implement a basic version of a Random Forest from scratch in Python. This example focuses on core concepts such as bootstrapping, decision tree creation, and majority voting.

We'll make the following assumptions:
- The trees will be simple decision stumps (trees with a depth of 1).
- We'll bootstrap the data for each tree.
- We'll use majority voting for classification.

In [5]:
import numpy as np
from collections import Counter

# Decision Tree Class
class DecisionTree:
    """Depth-1 decision tree (decision stump) for classification.

    The stump examines a subset of the features, tries every distinct value
    of each examined feature as a threshold, and keeps the split with the
    lowest weighted Gini impurity. Rows whose feature value is strictly
    below the threshold go left; all other rows go right.

    Attributes set by ``fit``:
        split_feature: column index used for the split.
        split_value: threshold compared against that column.
        left_prediction: label predicted when ``value < split_value``.
        right_prediction: label predicted otherwise.
    """

    def __init__(self, max_features=None):
        """Create an unfitted stump.

        Args:
            max_features: number of features drawn at random (without
                replacement) as split candidates. ``None`` means every
                feature is considered.
        """
        self.max_features = max_features
        self.split_feature = None
        self.split_value = None
        self.left_prediction = None
        self.right_prediction = None

    def fit(self, X, y):
        """Find the single best (feature, threshold) split for ``X``/``y``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: propagated from ``np.random.choice`` when
                ``max_features`` exceeds the number of features.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_features = X.shape[1]

        if self.max_features is None:
            # np.random.choice(n, None) returns a scalar, which cannot be
            # iterated; "no limit" therefore means "use every feature".
            feature_indices = np.arange(n_features)
        else:
            feature_indices = np.random.choice(
                n_features, self.max_features, replace=False
            )

        # Label used for a side of the split that receives no training rows.
        # Using the overall majority keeps predictions inside the label set.
        fallback = self._majority_vote(y)
        best_gini = float("inf")

        for feature in feature_indices:
            values = X[:, feature]
            for threshold in np.unique(values):
                left_mask = values < threshold
                y_left = y[left_mask]
                y_right = y[~left_mask]
                gini = self._gini_impurity(y_left, y_right)
                # Strict comparison: the first split reaching a given
                # impurity wins ties.
                if gini < best_gini:
                    best_gini = gini
                    self.split_feature = feature
                    self.split_value = threshold
                    self.left_prediction = self._majority_vote(y_left, fallback)
                    self.right_prediction = self._majority_vote(y_right, fallback)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        feature = self.split_feature
        threshold = self.split_value
        return np.array([
            self.left_prediction if row[feature] < threshold
            else self.right_prediction
            for row in X
        ])

    def _gini_impurity(self, left, right):
        """Return the size-weighted Gini impurity of a two-way split.

        Args:
            left: label array for the left side of the split.
            right: label array for the right side of the split.

        Returns:
            ``0`` when both sides are empty; otherwise the weighted mean of
            the two sides' impurities.
        """
        n = len(left) + len(right)
        if n == 0:
            return 0

        def side_impurity(labels):
            # One counting pass instead of one scan per class. An empty side
            # yields 1.0 but carries zero weight in the mean below.
            _, counts = np.unique(labels, return_counts=True)
            return 1.0 - sum((count / len(labels)) ** 2 for count in counts)

        weighted = len(left) * side_impurity(left) + len(right) * side_impurity(right)
        return weighted / n

    def _majority_vote(self, y, default=0):
        """Return the most common label in ``y``, or ``default`` if ``y`` is empty."""
        if len(y) == 0:
            return default
        return Counter(y).most_common(1)[0][0]


# Random Forest Class
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers with majority voting.

    Each tree is trained on a bootstrap sample (rows drawn with replacement)
    of the training data, and the forest predicts the label chosen by the
    most trees.
    """

    def __init__(self, n_estimators=10, max_features=None):
        """Create an unfitted forest.

        Args:
            n_estimators: number of trees trained by ``fit``.
            max_features: passed unchanged to every ``DecisionTree``.
        """
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Trees from any earlier ``fit`` call are discarded, so refitting
        never mixes models trained on different data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of class labels, one per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]

        # Build into a local list and publish it at the end, so a failure
        # part-way through does not leave a half-replaced ensemble.
        trees = []
        for _ in range(self.n_estimators):
            # Bootstrap: same size as the data, drawn with replacement.
            sample_indices = np.random.choice(n_samples, n_samples, replace=True)
            tree = DecisionTree(max_features=self.max_features)
            tree.fit(X[sample_indices], y[sample_indices])
            trees.append(tree)
        self.trees = trees

    def predict(self, X):
        """Return the majority-vote label for each row of ``X``.

        Ties go to the label that appears first among the trees' votes,
        because ``Counter.most_common`` keeps first-seen order.
        """
        # Shape (n_trees, n_samples): one row of votes per tree.
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        # Vote column by column. Building the result from a list lets NumPy
        # size the output dtype from all winners rather than only the first.
        winners = [
            Counter(votes).most_common(1)[0][0] for votes in tree_predictions.T
        ]
        return np.array(winners)


# Example Usage
if __name__ == "__main__":
    # Toy price data, one row per observation: [Open, Close, High, Low]
    X = np.array([
        [101.2, 102.1, 102.5, 100.9],
        [102.2, 103.1, 103.5, 102.2],
        [103.2, 102.4, 103.5, 102.2],
        [104.5, 103.5, 104.75, 102.5],
        [104.5, 106.5, 106.9, 103.7],
        [106.2, 107.8, 108.2, 106.1],
    ])
    # One class label (+1 or -1) per row of X
    y = np.array([1, 1, -1, -1, 1, 1])

    # Build a forest of five stumps, each limited to two candidate features
    clf = RandomForest(n_estimators=5, max_features=2)
    clf.fit(X, y)

    # Predict on the training rows and display the result
    predictions = clf.predict(X)
    print("Predictions:", predictions)

Predictions: [ 1  1 -1 -1  1  1]


Key Steps:
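- Decision Stump (the `DecisionTree` class): This is a simple classifier that looks for the single best feature and threshold to split the data, choosing among a random subset of `max_features` features the split with the lowest weighted Gini impurity.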
- Random Forest: This class builds multiple decision stumps (trees with depth 1), bootstraps the data, and aggregates the results from all trees using majority voting.

This code is quite basic and can be extended to include more sophisticated decision trees, hyperparameters, and splitting criteria. For instance:
- You can replace the depth-1 stump implemented in `DecisionTree` with a more advanced decision tree that splits recursively to a greater depth.
- The stumps already use Gini impurity to choose the feature and threshold; you can add alternative splitting criteria such as entropy (information gain).