# In [2]:
import numpy as np
from collections import Counter

class DecisionTree:
    """Binary classification tree grown by maximising entropy-based information gain.

    Internal nodes are stored as ``(feature_idx, threshold, left, right)``
    tuples; a sample goes left when ``x[feature_idx] < threshold`` and right
    otherwise. Anything that is not a tuple is a leaf holding the predicted
    label.

    NOTE: a random subset of ``int(sqrt(n_features))`` feature columns is drawn
    once at the root (from NumPy's global RNG) and the same subset is reused
    for every split of that tree.

    Args:
        max_depth: Maximum depth of the tree; ``None`` means unlimited.
        min_samples_split: Nodes with fewer samples than this become leaves.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (first seen wins a tie)."""
        return Counter(y).most_common(1)[0][0]

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``.

        Labels may be of any dtype ``np.unique`` can handle (ints, negative
        ints, strings, ...). An empty array has entropy 0.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0.0
        # np.unique only reports labels that occur, so every probability is
        # strictly positive and log2 is always defined.
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / y.size
        return -np.sum(probabilities * np.log2(probabilities))

    def _information_gain(self, X, y, feature_idx, threshold):
        """Return the entropy reduction from splitting on ``X[:, feature_idx] < threshold``.

        Returns 0 when the split would leave either side empty.
        """
        left_mask = X[:, feature_idx] < threshold
        n = len(y)
        n_left = int(np.count_nonzero(left_mask))
        n_right = n - n_left

        if n_left == 0 or n_right == 0:
            return 0

        right_mask = ~left_mask
        child_entropy = (
            (n_left / n) * self._entropy(y[left_mask])
            + (n_right / n) * self._entropy(y[right_mask])
        )
        return self._entropy(y) - child_entropy

    def _best_split(self, X, y, feature_indices):
        """Find the (feature, threshold) pair with the highest information gain.

        Only splits with strictly positive gain are accepted, which guarantees
        both children are non-empty. Returns ``(None, None)`` when no such
        split exists (e.g. all candidate columns are constant).
        """
        best_gain = 0.0
        best_feature, best_threshold = None, None

        for feature_idx in feature_indices:
            # The smallest value can never be a useful threshold: nothing is
            # strictly below it, so the left child would be empty.
            thresholds = np.unique(X[:, feature_idx])[1:]
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature_idx, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        return best_feature, best_threshold

    def _build_tree(self, X, y, depth=0, feature_indices=None):
        """Recursively grow the tree for samples ``X`` with labels ``y``.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: 1-D array of labels aligned with the rows of ``X``.
            depth: Depth of the node being built (root is 0).
            feature_indices: Columns eligible for splitting; drawn at random
                when ``None`` and then passed unchanged to all descendants.

        Returns:
            A leaf label, or a ``(feature_idx, threshold, left, right)`` tuple.
        """
        n_samples, n_features = X.shape

        # Stopping criteria
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if (depth_exhausted
                or n_samples < self.min_samples_split
                or len(np.unique(y)) == 1):
            return self._majority_label(y)

        if feature_indices is None:
            feature_indices = np.random.choice(
                n_features, int(np.sqrt(n_features)), replace=False
            )

        best_feature, best_threshold = self._best_split(X, y, feature_indices)

        # No split separates the labels any further: stop here instead of
        # recursing into an empty partition.
        if best_feature is None:
            return self._majority_label(y)

        left_mask = X[:, best_feature] < best_threshold
        right_mask = ~left_mask

        left_subtree = self._build_tree(
            X[left_mask], y[left_mask], depth + 1, feature_indices
        )
        right_subtree = self._build_tree(
            X[right_mask], y[right_mask], depth + 1, feature_indices
        )

        return (best_feature, best_threshold, left_subtree, right_subtree)

    def fit(self, X, y):
        """Grow the tree from training samples ``X`` and labels ``y``.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of shape (n_samples,).

        Raises:
            ValueError: If the shapes are inconsistent or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-dimensional with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")

        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, node):
        """Walk from ``node`` down to a leaf for the single sample ``x``."""
        while isinstance(node, tuple):
            feature_idx, threshold, left, right = node
            node = left if x[feature_idx] < threshold else right
        return node

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return np.array([self._predict_sample(x, self.tree) for x in np.asarray(X)])

class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote.

    Each tree is trained on a bootstrap sample (rows drawn with replacement,
    same size as the training set) taken with NumPy's global RNG.

    Args:
        n_estimators: Number of trees to train.
        max_depth: Passed to every ``DecisionTree``.
        min_samples_split: Passed to every ``DecisionTree``.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def _bootstrap_sample(self, X, y):
        """Return ``(X, y)`` rows resampled with replacement, keeping them aligned."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        indices = np.random.choice(n_samples, n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of shape (n_samples,).

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit a forest on zero samples")

        # Build into a local list so a failure part-way through does not leave
        # a half-trained forest behind.
        trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            trees.append(tree)
        self.trees = trees

    def predict(self, X):
        """Return the majority-vote label for each row of ``X``.

        Ties go to the label predicted by the earliest tree among those tied.

        Raises:
            RuntimeError: If the forest contains no trained trees.
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict() called before fit()")

        X = np.asarray(X)
        # Shape (n_trees, n_samples); transposing yields one row of votes per sample.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([Counter(votes).most_common(1)[0][0]
                         for votes in tree_preds.T])

# Example usage
if __name__ == "__main__":
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    # Demo: fit the forest on the iris data and report hold-out accuracy.
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Hold out 20% of the rows for evaluation; the split itself is seeded.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 100 shallow trees (depth limit 3).
    rf = RandomForest(n_estimators=100, max_depth=3)
    rf.fit(X_train, y_train)

    # Score the held-out rows against their true labels.
    predictions = rf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"Random Forest Accuracy: {accuracy:.2f}")

# Output: Random Forest Accuracy: 1.00
