# DECISION TREES

# In [6]:
import numpy as np

def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Args:
        y: Array-like of class labels.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the relative frequency ``p`` of
        each distinct label. A single-class (or empty) input yields ``0.0``.
    """
    # Only the per-class counts matter; the class values themselves are unused.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / counts.sum()
    # Every count from np.unique is at least 1, so log2 never sees a zero.
    # Adding 0.0 turns the negative zero produced by negating an all-zero sum
    # into a plain 0.0 and leaves every other value untouched.
    return -np.sum(probs * np.log2(probs)) + 0.0

def information_gain(y, left_y, right_y):
    """Return how much entropy is removed by splitting ``y`` into two parts.

    Args:
        y: Labels of the samples before the split.
        left_y: Labels routed to the left side of the split.
        right_y: Labels routed to the right side of the split.

    Returns:
        The entropy of ``y`` minus the average entropy of the two parts, each
        weighted by its share of the ``len(y)`` samples.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    entropy_before = entropy(y)
    total = len(y)
    # Each side contributes in proportion to the number of samples it holds.
    left_share = len(left_y) / total
    right_share = len(right_y) / total
    entropy_after = left_share * entropy(left_y) + right_share * entropy(right_y)
    return entropy_before - entropy_after

class Node:
    """A single node of a binary decision tree.

    An internal node carries a split (``feature``, ``threshold``) and its two
    children (``left``, ``right``); a leaf carries only a predicted ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description: which feature is tested and against what value.
        self.feature, self.threshold = feature, threshold
        # Subtrees followed after the split test.
        self.left, self.right = left, right
        # Prediction stored at a leaf; stays None on internal nodes.
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a prediction (``value`` is set)."""
        has_no_value = self.value is None
        return not has_no_value

class DecisionTree:
    """Binary classification tree grown greedily by maximising information gain.

    Each internal node tests ``x[feature] <= threshold``: samples passing the
    test go to the left child, all others go to the right child. Growth stops
    when a node is pure, when ``max_depth`` is reached, or when no threshold
    separates the samples.
    """

    def __init__(self, max_depth=5):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
        """
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        """Grow the tree from training data and store it in ``self.root``.

        Args:
            X: 2-D array-like of feature values, one row per sample.
            y: 1-D array-like of class labels, one per row of ``X``.

        Raises:
            ValueError: If ``y`` is empty.
        """
        # Coerce once so plain lists work with the boolean-mask indexing below.
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``.

        Args:
            X: 2-D array of the feature rows that reached this node.
            y: 1-D array of the matching labels.
            depth: Number of splits already made above this node.

        Returns:
            A leaf ``Node`` holding the majority label, or an internal ``Node``
            holding the best split and its two subtrees.
        """
        # One pass yields both the purity test and the majority label. Ties go
        # to the smallest label because np.unique returns labels sorted.
        labels, counts = np.unique(y, return_counts=True)
        majority = labels[counts.argmax()]
        if labels.size == 1 or depth >= self.max_depth:
            return Node(value=majority)

        best_gain = -1
        best_feature = None
        best_thresh = None
        best_mask = None
        n_samples = len(y)
        for feature in range(X.shape[1]):
            column = X[:, feature]
            for thresh in np.unique(column):
                left_mask = column <= thresh
                n_left = np.count_nonzero(left_mask)
                # A threshold that leaves one side empty is not a split.
                if n_left == 0 or n_left == n_samples:
                    continue
                gain = information_gain(y, y[left_mask], y[~left_mask])
                # Strict comparison keeps the first of several equally good splits.
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_thresh = thresh
                    best_mask = left_mask

        if best_feature is None:
            # Every candidate threshold put all samples on the same side.
            return Node(value=majority)

        # Partition with the exact mask that was scored, so every sample lands
        # in one child. A separate ``>`` comparison would silently drop rows
        # whose feature value is NaN, while ``_predict`` routes them right.
        right_mask = ~best_mask
        left = self._build_tree(X[best_mask], y[best_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return Node(feature=best_feature, threshold=best_thresh, left=left, right=right)

    def _predict(self, x, node):
        """Return the label stored at the leaf that sample ``x`` reaches from ``node``."""
        if node.is_leaf():
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict(x, node.left)
        return self._predict(x, node.right)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._predict(x, self.root) for x in X])


# In [7]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data and hold out 20% of the samples for evaluation.
x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a depth-3 tree on the training split.
tree = DecisionTree(max_depth=3)
tree.fit(x_train, y_train)

# Report the share of held-out samples predicted correctly, as a percentage.
y_pred = tree.predict(x_test)
accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy * 100)


# Output: Accuracy: 100.0
