## Decision Tree (classification)

### Gini impurity

 * with $c$ classes:
$$I_G(p)=1 - \sum_{j=1}^c p_j^2$$

In [1]:
import math
from collections import Counter

import numpy as np

In [2]:

def calc_gini(y):
    m = len(y)
    if m == 0:
        return 0
    counts = Counter(y)
    probas = [c/m for c in counts.values()]
    impurity = 1 - sum([p**2 for p in probas])
    return impurity

# testing the function
y = [1, 1, 1, 1, 0, 0, 0]
print(calc_gini(y))

0.48979591836734704


In [3]:
def entropy(y):
    m = len(y)
    counts = Counter(y)
    probas = [c/m for c in counts.values()]
    return -sum([p * math.log(p, 2) for p in probas])

# testing the function
y = [1, 1, 1, 1, 0, 0, 0]
print(entropy(y))

0.9852281360342516


In [None]:
def split_dataset(X, feature_index, threshold):
    left_indices = np.where(X[:, feature_index] < threshold)[0]
    right_indices = np.where(X[:, feature_index] >= threshold)[0]

    return left_indices, right_indices

# testing the function
X = np.array([[-1, 2], [3, 4], [3, 6], [-2, 8]])
feature_index = 0
threshold = 0
left, right = split_dataset(X, feature_index, threshold)
print(left, right)

[0 3] [1 2]


In [14]:
def gini_split(X, y, feature_index, threshold):
    left_indices, right_indices = split_dataset(X, feature_index, threshold)
    left_y, right_y = y[left_indices], y[right_indices]
    m = len(y)
    w_left, w_right = len(left_y) / m, len(right_y) / m
    gini_left = calc_gini(left_y)
    gini_right = calc_gini(right_y)
    gini = (w_left/m) * gini_left + (w_right/m) * gini_right
    return gini

# testing the function
y = np.array([1, 1, 0, 0])
print(gini_split(X, y, feature_index, threshold))
y = np.array([0, 1, 1, 0])
print(gini_split(X, y, feature_index, threshold))

0.125
0.0
