# Decision Trees
---

## 00. Imports

In [1]:
import numpy as np

## 01. Gini Index

In [2]:
def group_gini_index(samples: np.ndarray) -> float:
    """Return the Gini impurity of a single group of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the number of
    occurrences of class ``k`` divided by the length of ``samples``.

    Args:
        samples: Array of class labels. ``np.unique`` flattens its input
            while the denominator is ``samples.shape[0]``, so the labels
            are expected to be one-dimensional.

    Returns:
        The impurity as a plain Python float; ``0.0`` when every label is
        identical. An empty array yields ``1.0`` because no class
        contributes to the sum.
    """
    n_samples = samples.shape[0]
    # Only the per-class counts matter; the label values are discarded.
    _, class_counts = np.unique(samples, return_counts=True)
    proportions = class_counts / n_samples
    return float(1.0 - np.sum(proportions ** 2))

In [4]:
def groups_gini_index(first: np.array, second: np.array) -> float:
    """Return the size-weighted Gini index of a two-group partition.

    Each group's impurity (computed by ``group_gini_index``) is multiplied
    by the group's sample count and divided by the combined sample count;
    the two terms are then added.
    """
    n_total = first.shape[0] + second.shape[0]
    return sum(
        group_gini_index(group) * group.shape[0] / n_total
        for group in (first, second)
    )

## 02. Split

In [32]:
def find_split(X: np.ndarray, y: np.ndarray):
    """Search every feature for the threshold split with the lowest Gini index.

    For each feature the samples are sorted by that feature's value and
    every cut position between two *distinct* neighbouring values is scored
    with ``groups_gini_index``. The first candidate with the strictly lowest
    score wins.

    Args:
        X: Two-dimensional array, one row per sample and one column per
            feature.
        y: Labels indexed in the same order as the rows of ``X``.

    Returns:
        A tuple ``(split_feature, split_value, best_gini, X_left, X_right,
        y_left, y_right)``. ``split_value`` is the midpoint between the two
        neighbouring feature values at the cut, so every row of ``X_left``
        has a value at or below the lower neighbour and every row of
        ``X_right`` a value at or above the upper one. All seven entries are
        ``None`` when no valid cut exists (fewer than two samples, or every
        feature is constant).
    """
    n_samples, n_features = X.shape
    split_feature = split_value = best_gini = None
    X_left = X_right = y_left = y_right = None

    for feature_idx in range(n_features):
        order = X[:, feature_idx].argsort()
        X_sorted = X[order]
        y_sorted = y[order]
        feature_values = X_sorted[:, feature_idx]

        for sample_idx in range(1, n_samples):
            lower = feature_values[sample_idx - 1]
            upper = feature_values[sample_idx]
            # A cut between two equal values cannot be expressed as a
            # threshold: the midpoint would equal the value itself and the
            # tied samples would land on both sides of the partition.
            if lower == upper:
                continue

            y_left_, y_right_ = np.split(y_sorted, [sample_idx])
            gini = groups_gini_index(y_left_, y_right_)

            if best_gini is None or gini < best_gini:
                best_gini = gini
                split_feature = feature_idx
                split_value = (upper + lower) / 2
                y_left, y_right = y_left_, y_right_
                X_left, X_right = np.split(X_sorted, [sample_idx])

    return split_feature, split_value, best_gini, X_left, X_right, y_left, y_right

## 03. Tree structure

In [35]:
class Node:
    """One node of a binary tree.

    The constructor stores ``label``, ``split_feature`` and ``split_value``
    exactly as given (each defaults to ``None``) and creates the node with
    no children: ``left`` and ``right`` both start as ``None``.

    NOTE(review): presumably ``label`` is set on leaves and the split fields
    on internal nodes; the tree-building code is not visible here, confirm
    against it.
    """

    def __init__(self, label=None, split_feature=None, split_value=None):
        # Children are attached after construction.
        self.left = self.right = None
        self.split_value = split_value
        self.split_feature = split_feature
        self.label = label

## 04. Load test data set

In [33]:
# Feature (input) columns.
FEATURES_COLUMN_NAMES = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Label (target) column, kept as a list so it can be concatenated below.
LABELS_COLUMN_NAMES = ["species"]

# Every column: features first, then the label.
COLUMN_NAMES = [*FEATURES_COLUMN_NAMES, *LABELS_COLUMN_NAMES]

In [23]:
import pandas as pd

# Column names are passed explicitly, so pandas reads the first CSV row as data.
df = pd.read_csv('../data_sets/iris.csv', names=COLUMN_NAMES)
# to_numpy() is the accessor the pandas documentation recommends over .values.
X = df[FEATURES_COLUMN_NAMES].to_numpy()
# Selecting with a one-element list keeps y two-dimensional: (n_samples, 1).
y = df[LABELS_COLUMN_NAMES].to_numpy()

In [27]:
from sklearn.preprocessing import LabelEncoder

# Flatten y to one dimension and replace each distinct label with an
# integer code; the fitted encoder is kept in `le`.
le = LabelEncoder()
y = le.fit_transform(y.reshape(-1))

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)