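Random Forest steps/processes:

1. Bootstrapping: draw a random sample (with replacement) of the training data for each tree
2. Decision tree creation: fit one decision tree on each bootstrap sample
3. Repeat to build hundreds of decision trees (DTs)
4. Every DT makes its own prediction
5. The most common prediction (majority vote) is the final output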


In [None]:
# Import python libraries
import numpy as np

In [None]:
# Class Node (Root node, internal/decision node and leaf node)
class Node:
    """A single vertex of a decision tree (root, internal/decision or leaf).

    A node built with a ``value`` is a leaf; a node built with ``feature``,
    ``threshold``, ``left`` and ``right`` describes a split.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: column index and cut-off tested at this node.
        self.feature, self.threshold = feature, threshold
        # Child subtrees reached on either side of the split.
        self.left, self.right = left, right
        # Stored value; anything other than None marks this node as a leaf.
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a value (``value`` is not None)."""
        if self.value is None:
            return False
        return True

In [None]:
# Create a class DecisionTree that contains all methods used for prediction
class DecisionTree:
    """Classification tree grown with entropy-based binary splits.

    Each internal node tests ``x[feature] <= threshold``; the tree is grown
    until every leaf is pure or no further valid split exists.
    """

    def __init__(self):
        # Root Node of the fitted tree; None until fit() has been called.
        self.root = None

    def fit(self, x, y):
        """Grow the tree from samples ``x`` (n_samples, n_features) and labels ``y``.

        Raises:
            ValueError: if ``x`` is not 2-D, if ``x`` and ``y`` differ in
                length, or if the dataset is empty.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError("x must be a 2-D array of shape (n_samples, n_features)")
        if len(y) != x.shape[0]:
            raise ValueError("x and y must contain the same number of samples")
        self.root = self.build_tree(x, y)

    def most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def split(self, x_column, split_threshold):
        """Return (left, right) index arrays for ``<=`` / ``>`` the threshold."""
        left = np.argwhere(x_column <= split_threshold).flatten()
        right = np.argwhere(x_column > split_threshold).flatten()
        return left, right

    def entropy(self, y):
        """Return the Shannon entropy (natural log) of the labels in ``y``.

        Works for any number of classes and any label values; an empty or
        pure label set has entropy 0.
        """
        y = np.asarray(y)
        if y.size == 0:
            return 0
        _, counts = np.unique(y, return_counts=True)
        if len(counts) == 1:
            return 0
        probabilities = counts / counts.sum()
        return -np.sum(probabilities * np.log(probabilities))

    def _weighted_child_entropy(self, x_column, y, threshold):
        """Return the size-weighted entropy of both children, or None if a side is empty."""
        left, right = self.split(x_column, threshold)
        if len(left) == 0 or len(right) == 0:
            return None
        total = len(y)
        return (
            (len(left) / total) * self.entropy(y[left])
            + (len(right) / total) * self.entropy(y[right])
        )

    def information_gain(self, x_column, y, threshold):
        """Return parent entropy minus weighted child entropy for this split.

        A split that leaves one side empty separates nothing, so its gain is 0.
        """
        child_entropy = self._weighted_child_entropy(x_column, y, threshold)
        if child_entropy is None:
            return 0
        return self.entropy(y) - child_entropy

    def best_split(self, x, y):
        """Return ``(feature, threshold)`` of the split with the purest children.

        Candidate thresholds are the midpoints between consecutive distinct
        values of each feature. Minimising weighted child entropy is the same
        as maximising information gain, because the parent entropy is constant.
        Returns ``(None, None)`` when no split puts samples on both sides.
        """
        num_feature = x.shape[1]
        lowest_entropy = float("inf")
        split_feature = None
        split_threshold = None

        for feature in range(num_feature):
            x_column = x[:, feature]
            # Distinct values only: duplicates would yield repeated thresholds
            # and, for a repeated maximum, a threshold with an empty right side.
            distinct = np.unique(x_column)
            thresholds = (distinct[:-1] + distinct[1:]) / 2
            for th in thresholds:
                child_entropy = self._weighted_child_entropy(x_column, y, th)
                if child_entropy is None:
                    # Degenerate split (e.g. midpoint rounded onto the max value).
                    continue
                if child_entropy < lowest_entropy:
                    lowest_entropy = child_entropy
                    split_feature = feature
                    split_threshold = th

        return split_feature, split_threshold

    def build_tree(self, x, y):
        """Recursively build and return the subtree for samples ``x`` / labels ``y``.

        Raises:
            ValueError: if called with no samples.
        """
        if len(y) == 0:
            raise ValueError("cannot build a tree from an empty dataset")

        # Pure node: every sample has the same label.
        if len(np.unique(y)) == 1:
            return Node(value=self.most_common_label(y))

        best_feature, best_threshold = self.best_split(x, y)
        if best_feature is None:
            # Identical feature rows with different labels cannot be separated;
            # fall back to a majority-vote leaf instead of recursing forever.
            return Node(value=self.most_common_label(y))

        left_idxs, right_idxs = self.split(x[:, best_feature], best_threshold)
        left = self.build_tree(x[left_idxs, :], y[left_idxs])
        right = self.build_tree(x[right_idxs, :], y[right_idxs])
        return Node(best_feature, best_threshold, left, right)

    def traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for one sample ``x``; return the leaf value."""
        if node.is_leaf():
            return node.value

        if x[node.feature] <= node.threshold:
            return self.traverse_tree(x, node.left)
        return self.traverse_tree(x, node.right)

    def predict(self, x):
        """Return an array with one predicted label per row of ``x``.

        Raises:
            ValueError: if the tree has not been fitted yet.
        """
        if self.root is None:
            raise ValueError("DecisionTree.predict called before fit")
        samples = np.asarray(x)
        return np.array([self.traverse_tree(sample, self.root) for sample in samples])

In [None]:
#Create a class RandomForest that contains all methods used for prediction
class RandomForest:
    """Ensemble of DecisionTree classifiers trained on bootstrap samples.

    Each tree is fitted on a same-size sample drawn with replacement from
    the training data; predictions are combined by majority vote.
    """

    def __init__(self, n_estimators=100):  # n_estimators = number of trees
        self.n_estimators = n_estimators
        # Fitted trees; empty until fit() has been called.
        self.trees = []

    def fit(self, x, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Raises:
            ValueError: if ``x`` and ``y`` contain different numbers of samples.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if len(x) != len(y):
            raise ValueError("x and y must contain the same number of samples")

        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTree()
            x_sample, y_sample = self.bootstrap_sample(x, y)
            tree.fit(x_sample, y_sample)
            self.trees.append(tree)

    def predict(self, x):
        """Return a list with the majority-vote label for each row of ``x``.

        Raises:
            ValueError: if no trees have been trained yet.
        """
        if not self.trees:
            raise ValueError("RandomForest.predict called before fit (no trees trained)")

        x = np.asarray(x)
        # Shape (n_trees, n_samples): one row of predictions per tree.
        per_tree = np.array([tree.predict(x) for tree in self.trees])
        # Swap to (n_samples, n_trees) so each row holds the votes for one sample.
        per_sample = np.swapaxes(per_tree, 0, 1)
        return [self.most_common_label(votes) for votes in per_sample]

    def most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def bootstrap_sample(self, x, y):
        """Return ``(x, y)`` resampled with replacement to the original size.

        The same row indices are applied to both, so samples stay paired
        with their labels.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        num_samples = x.shape[0]
        indices = np.random.choice(num_samples, num_samples, replace=True)
        return x[indices], y[indices]


