# Decision Trees
---

## 00. Imports

In [1]:
import numpy as np
from typing import Tuple, List, Union

## 01. Gini Index

In [2]:
def group_gini_index(samples: np.array) -> float:
    n_samples = samples.shape[0]
    _, n_groups = np.unique(samples, return_counts=True)
    return 1 - sum(map(lambda x: (x/n_samples)**2, n_groups))

In [3]:
def groups_gini_index(first: np.array, second: np.array) -> float:
    n_first = first.shape[0]
    n_second = second.shape[0]
    n_total = n_first + n_second
    return group_gini_index(first) * n_first / n_total + \
           group_gini_index(second) * n_second / n_total

## 02. Split

In [95]:
def find_split(X: np.array, y: np.array):
    n_samples, n_features = X.shape
    split_feature, split_value, best_gini, X_left, X_right, y_left, y_right = \
        None, None, None, None, None, None, None
    
    for feature_idx in range(n_features):
        order = X[:, feature_idx].argsort()
        X_sorted = X[order]
        y_sorted = y[order]
        
        for sample_idx in range(1, n_samples):
            y_left_, y_right_ = np.split(y_sorted, [sample_idx])
            gini = groups_gini_index(y_left_, y_right_)
            
            if best_gini is None or gini < best_gini:
                best_gini = gini
                split_feature = feature_idx
                split_value = (X_sorted[sample_idx, feature_idx] + X_sorted[sample_idx - 1, feature_idx])/2
                y_left, y_right = y_left_, y_right_
                X_left, X_right = np.split(X_sorted, [sample_idx])
            
    return split_feature, split_value, X_left, X_right, y_left, y_right

## 03. Tree structure

In [5]:
class Node:
    def __init__(
        self, assigned_label: int = None, 
        split_feature: int = None, 
        split_value: float = None
    ) -> None:
        self.assigned_label = assigned_label
        self.split_feature = split_feature
        self.split_value = split_value
        self.left_child = None
        self.right_child = None

In [6]:
class TreeClassifier:
    def __init__(self, max_depth: int, min_samples_split: int) -> None:
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        
    def fit(self, X: np.array, y: np.array) -> None:
        self.root = Node()
        self.__split(X, y, self.root, 1)
        
    def predict(self, X: np.array) -> np.array:
        predictions = np.empty(X.shape[0])
        for idx, x in enumerate(X):
            predictions[idx] = self.__single_example_prdiction(x)
        return predictions
        
    def __split(self, X: np.array, y: np.array, node: Node, depth: int) -> None:
        n_samples = y.shape[0]
        
        if n_samples <= self.min_samples_split or depth >= self.max_depth:
            unique, counts = np.unique(y, return_counts=True)
            node.assigned_label = unique[np.argmax(counts)]
            return
        
        node.split_feature, node.split_value, X_left, X_right, y_left, y_right = find_split(X, y)
        node.left_child, node.right_child = Node(), Node()
        
        self.__split(X_left, y_left, node.left_child, depth + 1)
        self.__split(X_right, y_right, node.right_child, depth + 1)
        
    def __single_example_prdiction(self, x: np.array) -> int:
        node = self.root
        while node.assigned_label == None:
            if x[node.split_feature] > node.split_value:
                node = node.right_child
            else:
                node = node.left_child
        return node.assigned_label

## 04. Random Forest

In [81]:
class RandomForestClassifier():
    def __init__(self, n_estimators: int, fraction: float, max_depth: int, min_samples_split: int):
        self.n_estimators = n_estimators
        self.fraction = fraction
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        
    def fit(self, X: np.array, y: np.array) -> None:
        self.estimators = [TreeClassifier(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
                          for _ in range(self.n_estimators)]
        bags = self.__create_bags(X, y, self.n_estimators, self.fraction)
        for estimator, (X_bag, y_bag) in zip(self.estimators, bags):
            estimator.fit(X_bag, y_bag)
        
    def predict(self, X: np.array) -> np.array:
        n_samples = X.shape[0]
        predictions_raw = np.empty((self.n_estimators, n_samples))
        predictions = np.empty((n_samples))
        for i in range(self.n_estimators):
            pred = self.estimators[i].predict(X)
            predictions_raw[i] = pred
        for i in range(n_samples):
            predictions[i] = self.__get_most_frequent_element_value(predictions_raw[:, i])
        return predictions
    
    def __create_bags(self, X: np.array, y: np.array, n_bags: int, fraction: float) -> List[Tuple[np.array, np.array]]:
        n_samples = X.shape[0]
        bags = []
        for _ in range(n_bags):
            indexes = np.random.choice(n_samples, int(n_samples * fraction), replace = True)
            bags.append((X[indexes], y[indexes]))
        return bags
    
    def __get_most_frequent_element_value(self, pred: np.array) -> Union[int, float]:
        values, counts = np.unique(pred, return_counts=True)
        idx = np.argmax(counts)
        return values[idx]  

## 05. Test using 'Iris' data set

### 05.00.  Imports

In [82]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### 05.01. Settings

In [83]:
FEATURES_COLUMN_NAMES = [
    "sepal_length", 
    "sepal_width", 
    "petal_length", 
    "petal_width"
]

LABELS_COLUMN_NAMES = [
    "species"
]

COLUMN_NAMES = FEATURES_COLUMN_NAMES + \
    LABELS_COLUMN_NAMES

DATA_SET_PATH = "../data_sets/iris.csv"

### 05.02. Load

In [84]:
df = pd.read_csv(DATA_SET_PATH, names=COLUMN_NAMES)
X = df[FEATURES_COLUMN_NAMES].values
y = df[LABELS_COLUMN_NAMES].values

### 05.03. Feature engineering

In [85]:
le = LabelEncoder()
y = le.fit_transform(y.ravel())

### 05.04. Data split

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

### 05.05. Train Single Tree

In [87]:
tree_classifier = TreeClassifier(max_depth=10, min_samples_split=2)
tree_classifier.fit(X_train, y_train)

### 05.06. Predict Single Tree

In [88]:
y_pred = tree_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {0:.2f}%".format(acc * 100))

Accuracy: 86.84%


### 05.07. Train Random Forest

In [93]:
rf_classifier = RandomForestClassifier(n_estimators=20, fraction=0.7, max_depth=10, min_samples_split=2)
rf_classifier.fit(X_train, y_train)

### 05.08. Predict Random Forest

In [94]:
y_pred = rf_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {0:.2f}%".format(acc * 100))

Accuracy: 94.74%


## 05. Test using 'Wine' data set

### 05.01. Settings

In [14]:
FEATURES_COLUMN_NAMES = [
    "fixed_acidity", 
    "volatile_acidity",
    "citric_acid",
    "residual_sugar",
    "chlorides",
    "free_sulfur_dioxide",
    "total_sulfur_dioxide",
    "density",
    "ph",
    "sulphates",
    "alcohol"
]

LABELS_COLUMN_NAMES = [
    "quality"
]

COLUMN_NAMES = FEATURES_COLUMN_NAMES + \
    LABELS_COLUMN_NAMES

DATA_SET_PATH = "../data_sets/winequality-white.csv"

### 05.02. Load

In [15]:
df = pd.read_csv(DATA_SET_PATH, sep=";")
X = df[FEATURES_COLUMN_NAMES].values
y = df[LABELS_COLUMN_NAMES].values

### 05.03. Data split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

### 05.04. Train

In [17]:
tree_classifier = TreeClassifier(max_depth=10, min_samples_split=2)
tree_classifier.fit(X_train, y_train)

### 05.05. Predict

In [18]:
y_pred = tree_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {0:.2f}%".format(acc * 100))

Accuracy: 53.39%


In [39]:
rf_classifier = RandomForestClassifier(n_estimators=10, fraction=0.6, max_depth=10, min_samples_split=2)

In [40]:
rf_classifier.fit(X_train, y_train)

In [41]:
y_pred = rf_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {0:.2f}%".format(acc * 100))

Accuracy: 54.04%


In [42]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
skl_rf_classifier = RandomForestClassifier(n_estimators=10, max_depth=10, min_samples_split=2, max_samples=0.6)

In [78]:
skl_rf_classifier.fit(X_train, np.ravel(y_train))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=0.6,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [79]:
y_pred = skl_rf_classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy: {0:.2f}%".format(acc * 100))

Accuracy: 58.12%
