In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import math
from collections import Counter
import numpy as np

# Load the Iris dataset bundle and pull out the feature matrix and label vector.
iris = load_iris()
x, y = iris.data, iris.target

# Hold out 10% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=123,
)

In [3]:
def entropy_func(class_count, num_samples):
    """Return a single class's entropy term, ``-p * ln(p)``.

    Args:
        class_count: Number of samples that belong to the class.
        num_samples: Total number of samples in the group.

    Returns:
        The term ``-p * ln(p)`` with ``p = class_count / num_samples``.
        The natural logarithm is used, so the value is expressed in nats.
        A scalar probability of exactly zero yields ``0.0``.
    """
    probability = class_count / num_samples
    # By convention 0 * log(0) contributes nothing to entropy. Evaluating it
    # directly would give NaN (and a RuntimeWarning) and poison any sum it is
    # added to. The guard is scalar-only so array inputs behave as before.
    if np.isscalar(probability) and probability == 0:
        return 0.0
    return -probability * np.log(probability)

class Group:
    """A set of class labels together with the entropy of their distribution."""

    def __init__(self, group_classes):
        """Store the labels and compute their entropy once, up front."""
        self.group_classes = group_classes
        self.entropy = self.group_entropy()

    def __len__(self):
        """Return the number of labels, read from the container's ``size`` attribute."""
        return self.group_classes.size

    def group_entropy(self):
        """Add up ``entropy_func`` over the occurrence count of every distinct label.

        Returns ``0`` when the group holds no labels.
        """
        total = 0
        # Only the per-label counts matter; the label values themselves are not needed.
        for occurrences in Counter(self.group_classes).values():
            total += entropy_func(occurrences, len(self))
        return total


class Node:
    """A tree node: a split rule plus optional depth, children and value."""

    def __init__(self, split_feature, split_val, depth=None, child_node_a=None, child_node_b=None, val=None):
        """Record the constructor arguments on the instance unchanged."""
        # The split rule: which feature is tested and against which value.
        self.split_feature, self.split_val = split_feature, split_val
        self.depth = depth
        # The two sub-nodes; both default to None.
        self.child_node_a, self.child_node_b = child_node_a, child_node_b
        # NOTE(review): presumably the prediction stored at a leaf; nothing
        # visible here reads it yet, so confirm once predict() is written.
        self.val = val

    def predict(self, data):
        """Placeholder: prediction is not implemented, so this returns ``None``."""
        return None


class DecisionTreeClassifier(object):
    """Skeleton of an entropy-based decision tree classifier.

    Only the entropy and information-gain helpers are implemented. The split
    search and tree construction methods are placeholders that return ``None``.
    """

    def __init__(self, max_depth):
        """Remember the depth limit and start with no tree.

        Args:
            max_depth: Depth limit for the tree; stored, not yet used by any
                method in this class.
        """
        self.depth = 0
        self.max_depth = max_depth
        # Stays None until a tree is assigned (presumably by build_tree once
        # it is implemented).
        self.tree = None

    @staticmethod
    def get_split_entropy(group_a: Group, group_b: Group):
        """Return the size-weighted entropy of the two child groups.

        Each child's entropy is weighted by its share of the combined sample
        count.

        Raises:
            ZeroDivisionError: If both groups are empty.
        """
        split_entropy = 0
        parent_group_count = len(group_a) + len(group_b)
        for group in (group_a, group_b):
            split_entropy += (len(group) / parent_group_count) * group.group_entropy()
        # Fix: this value used to be computed and then dropped, so the method
        # returned None and get_information_gain failed on `float - None`.
        return split_entropy

    def get_information_gain(self, parent_group: Group, child_group_a: Group, child_group_b: Group):
        """Return the parent entropy minus the weighted entropy of the children."""
        information_gain = parent_group.group_entropy() - self.get_split_entropy(child_group_a, child_group_b)
        return information_gain

    def get_best_feature_split(self, feature_values, classes):
        """Placeholder: not implemented, returns ``None``."""
        pass

    def get_best_split(self, data, classes):
        """Placeholder: not implemented, returns ``None``."""
        pass

    def build_tree(self, data, classes, depth=0):
        """Placeholder: not implemented, returns ``None``."""
        pass

    def predict(self, data):
        """Delegate prediction to ``self.tree.predict``.

        ``self.tree`` is ``None`` after construction, so calling this before a
        tree has been assigned raises ``AttributeError``.
        """
        return self.tree.predict(data)

In [51]:
def get_split_entropy(group_a: Group, group_b: Group):
    """Return the entropy of a two-way split, weighting each child by its size.

    Every child contributes ``(len(child) / combined size) * child.group_entropy()``.
    """
    combined_size = len(group_a) + len(group_b)
    weighted_entropy = 0
    for child in (group_a, group_b):
        share = len(child) / combined_size
        weighted_entropy += share * child.group_entropy()
    return weighted_entropy

def get_information_gain(parent_group: Group, child_group_a: Group, child_group_b: Group):
    """Return how much entropy drops when the parent is split into the two children."""
    entropy_before = parent_group.group_entropy()
    entropy_after = get_split_entropy(child_group_a, child_group_b)
    return entropy_before - entropy_after

def split(data, classes, split_feature, split_val):
    """Partition the samples by thresholding one feature column.

    The labels are appended to ``data`` as the last column, and the rows of
    that combined array are divided by comparing column ``split_feature``
    against ``split_val``.

    Args:
        data: Feature values, either a 1-D vector or a 2-D matrix.
        classes: Labels, one per row of ``data``.
        split_feature: Column index of the feature to test.
        split_val: Threshold the feature is compared against.

    Returns:
        A tuple ``(child_a, child_b)``: rows whose feature is strictly greater
        than ``split_val``, followed by all remaining rows. The label is the
        last column of each returned array.
    """
    dataset = np.c_[data, classes]
    # Compare as floats. The column used to be cast to int, which truncated
    # fractional feature values (5.9 -> 5) and put such rows on the wrong side
    # of thresholds like 5.5. A float cast still parses numeric strings, which
    # is what the column holds when `data` mixes text and numbers.
    feature_column = dataset[:, split_feature].astype(float)
    mask = feature_column > split_val

    return dataset[mask], dataset[~mask]

def get_best_feature_split(feature_values, classes):
    """Return the threshold on one feature that gives the highest information gain.

    Every distinct feature value is tried as a threshold. A candidate replaces
    the current best only when its gain is strictly larger, so the earliest of
    several equally good thresholds is kept. Returns ``0`` when no candidate
    produces a gain above zero.
    """
    whole = Group(classes)
    winner, winner_gain = 0, 0

    for candidate in np.unique(feature_values):
        # The feature vector is the only data column, hence column index 0;
        # the labels come back as the last column of each part.
        above, rest = split(feature_values, classes, 0, candidate)
        above_group = Group(above[:, -1])
        rest_group = Group(rest[:, -1])
        candidate_gain = get_information_gain(whole, above_group, rest_group)

        if candidate_gain > winner_gain:
            winner, winner_gain = candidate, candidate_gain

    return winner

In [14]:
# Toy dataset of five samples: a letter column and a number column, one row per sample.
data = np.array([["A", "B", "B", "B", "B"], [1, 1, 2, 2, 3]]).T
# One class label per sample.
classes = np.array([0, 1, 1, 0, 1])

In [54]:
# Search column 1 of the toy data (cast to int) for the threshold with the
# highest information gain.
split_val = get_best_feature_split(data[:, 1].astype(int), classes.astype(int))
print(split_val)

2


In [39]:
# Partition the toy rows on column 1 with threshold 2: child_a gets the rows
# above the threshold, child_b the rest.
child_a, child_b = split(data, classes, 1, 2)

In [40]:
# Display both partitions.
child_a, child_b

(array([['B', '3', '1']], dtype='<U21'),
 array([['A', '1', '0'],
        ['B', '1', '1'],
        ['B', '2', '1'],
        ['B', '2', '0']], dtype='<U21'))

In [24]:
# Append the labels to the feature matrix as an extra last column.
result = np.c_[data, classes]

In [25]:
# Display the combined array.
result

array([['A', '1', '0'],
       ['B', '1', '1'],
       ['B', '2', '1'],
       ['B', '2', '0'],
       ['B', '3', '1']], dtype='<U21')

In [28]:
# Order the rows by column 1, largest first: ascending argsort, then reversed.
# NOTE: this rebinds `result`, so the unsorted array is no longer available
# under that name. The recorded output shows a string dtype, so the values
# are compared as text, not as numbers.
result = result[result[:, 1].argsort()][::-1]

In [29]:
# Display the reordered rows.
result

array([['B', '3', '1'],
       ['B', '2', '0'],
       ['B', '2', '1'],
       ['B', '1', '1'],
       ['A', '1', '0']], dtype='<U21')

In [37]:
# One group over all labels, plus a hand-made split of them:
# the first label on its own, and every remaining label.
my_general_group = Group(classes)
split1 = Group(np.array([classes[0]]))
split2 = Group(classes[1:])

In [43]:
# NOTE: a cell echoes only its last expression, so the first two results are
# computed and then discarded; the single recorded value belongs to split2.
my_general_group.group_entropy()
split1.group_entropy()
split2.group_entropy()

0.5623351446188083

In [45]:
# Information gain from dividing the full label set into split1 and split2.
get_information_gain(my_general_group, split1, split2)

0.22314355131420988

In [None]:
# NOTE(review): these are built from the same expressions as split1 / split2,
# and nothing visible in this notebook uses them; the cell has no execution
# count. Likely leftover scratch work - confirm before keeping.
split3 = Group(np.array([classes[0]]))
split4 = Group(classes[1:])

In [46]:
# Shape of the training feature matrix.
x_train.shape

(135, 4)