In [150]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import math
from collections import Counter
import numpy as np

# Load the iris dataset and unpack the feature matrix and the label vector.
iris = load_iris()
x, y = iris.data, iris.target

# Hold out 10% of the samples for evaluation; the fixed seed keeps the
# partition identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=123,
)

In [218]:
def entropy_func(class_count, num_samples):
    """Return one class's contribution to the entropy of a group.

    The contribution is ``-p * ln(p)`` with ``p = class_count / num_samples``
    (natural logarithm, as computed by ``np.log``).

    Args:
        class_count: Number of samples belonging to the class.
        num_samples: Total number of samples in the group.

    Returns:
        The entropy term for that class. A ``class_count`` of 0 produces
        ``nan`` (0 * -inf), so callers are expected to pass positive counts.
    """
    p = class_count / num_samples
    return -p * np.log(p)

def split(data, classes, split_feature, split_val):
    """Partition samples on ``feature >= split_val`` and drop the tested column.

    The labels are appended to ``data`` as the last column, so each returned
    child holds the remaining features followed by the label column.

    Args:
        data: Feature values; a 2-D matrix or a single 1-D feature column.
        classes: Label of each sample, aligned with the rows of ``data``.
        split_feature: Index of the column to test.
        split_val: Threshold compared against the tested column.

    Returns:
        ``(child_a, child_b)``: rows whose tested value is ``>= split_val`` and
        the remaining rows. The tested column is removed from both, so the
        columns to its right shift one position to the left.
    """
    table = np.c_[data, classes]
    goes_to_a = table[:, split_feature].astype(float) >= split_val
    halves = (table[goes_to_a], table[~goes_to_a])
    child_a, child_b = (np.delete(half, split_feature, axis=1) for half in halves)
    return child_a, child_b

class Group:
    """A collection of class labels together with its entropy."""

    def __init__(self, group_classes):
        """Store the labels and compute their entropy once, up front.

        Args:
            group_classes: Sized iterable holding the label of each sample.
        """
        self.group_classes = group_classes
        # group_entropy() reads group_classes, so it must be assigned first.
        self.entropy = self.group_entropy()

    def __len__(self):
        """Return the number of samples in the group."""
        return len(self.group_classes)

    def group_entropy(self):
        """Return the entropy of the label distribution (0 for an empty group)."""
        total = len(self)
        label_counts = Counter(self.group_classes)
        return sum(entropy_func(count, total) for count in label_counts.values())

class Node:
    """A node of a binary decision tree.

    A leaf carries ``val`` (the predicted label). An internal node carries a
    test on one feature and two children: ``child_node_a`` for samples whose
    feature value is ``>= split_val`` and ``child_node_b`` for the others.
    """

    def __init__(self, split_feature=None, split_val=None, depth=None, child_node_a=None, child_node_b=None, val=None):
        """
        Args:
            split_feature: Index into a sample of the feature tested by this node.
            split_val: Threshold the feature value is compared against.
            depth: Depth of the node in the tree.
            child_node_a: Subtree for samples with feature value ``>= split_val``.
            child_node_b: Subtree for samples with feature value ``< split_val``.
            val: Predicted label; a value other than ``None`` marks a leaf.
        """
        self.split_feature = split_feature
        self.split_val = split_val
        self.depth = depth
        self.child_node_a = child_node_a
        self.child_node_b = child_node_b
        self.val = val

    def predict(self, data):
        """Return the label predicted for a single sample.

        Args:
            data: Indexable sample; ``data[split_feature]`` is the tested value.

        Returns:
            The ``val`` of the leaf reached by following the tests.
        """
        # `is not None` rather than truthiness: label 0 is a valid leaf value.
        if self.val is not None:
            return self.val
        # `>=` matches the training-time partition in split(), which sends
        # samples equal to the threshold to child A. A strict `>` here would
        # route exactly those samples down the opposite branch.
        if data[self.split_feature] >= self.split_val:
            return self.child_node_a.predict(data)
        return self.child_node_b.predict(data)

class DecisionTreeClassifier(object):
    """Binary decision tree for numeric features, grown greedily by information gain.

    Each internal node partitions its samples on ``feature >= threshold``.
    Because ``split`` removes the tested column from the data handed to the
    children, a feature is used at most once on any root-to-leaf path; the
    tree keeps track of the original column index of every remaining feature
    so that the nodes it builds can be applied to full, unreduced samples.
    """

    def __init__(self, max_depth):
        """
        Args:
            max_depth: Depth at which a node becomes a leaf regardless of purity.
        """
        self.depth = 0  # Not read by any method of this class; kept so the attribute still exists.
        self.max_depth = max_depth
        self.tree = None  # Root Node, assigned by fit().

    @staticmethod
    def get_split_entropy(group_a: Group, group_b: Group):
        """Return the size-weighted average entropy of two child groups."""
        parent_group_count = len(group_a) + len(group_b)
        return sum(
            (len(group) / parent_group_count) * group.group_entropy()
            for group in (group_a, group_b)
        )

    def get_information_gain(self, parent_group: Group, child_group_a: Group, child_group_b: Group):
        """Return the parent's entropy minus the weighted entropy of its children."""
        return parent_group.group_entropy() - self.get_split_entropy(child_group_a, child_group_b)

    def _best_threshold(self, feature_values, classes, parent_group):
        """Find the best threshold for one feature column.

        Returns:
            ``(threshold, gain)`` for the candidate with the highest gain among
            those that leave both children non-empty (ties go to the largest
            threshold), or ``(None, 0)`` when no such candidate has a
            non-negative gain.
        """
        classes = np.asarray(classes)
        numeric_values = np.asarray(feature_values).astype(float)
        best_split_val = None
        best_gain = 0

        for threshold in np.unique(feature_values):
            goes_to_a = numeric_values >= threshold
            # A threshold that sends every sample to the same side is not a split.
            if goes_to_a.all() or not goes_to_a.any():
                continue
            gain = self.get_information_gain(
                parent_group, Group(classes[goes_to_a]), Group(classes[~goes_to_a])
            )
            if gain >= best_gain:
                best_gain = gain
                best_split_val = threshold
        return best_split_val, best_gain

    def get_best_feature_split(self, feature_values, classes):
        """Return the threshold with the highest information gain for one feature.

        Args:
            feature_values: 1-D array with the value of the feature for each sample.
            classes: Label of each sample.

        Returns:
            The best threshold, or 0 when no threshold separates the samples
            into two non-empty groups.
        """
        threshold, _ = self._best_threshold(feature_values, classes, Group(classes))
        return 0 if threshold is None else threshold

    def _find_best_split(self, data, classes):
        """Return ``(column, threshold, gain)`` of the best split over all columns.

        ``column`` is ``None`` when no column offers a valid split. Ties between
        columns go to the one with the highest index.
        """
        parent_group = Group(classes)
        best_argument, best_split, best_gain = None, None, 0
        for argument in range(data.shape[1]):
            split_val, gain = self._best_threshold(data[:, argument], classes, parent_group)
            if split_val is None:
                continue
            if gain >= best_gain:
                best_gain = gain
                best_argument = argument
                best_split = split_val
        return best_argument, best_split, best_gain

    @staticmethod
    def _make_leaf(classes, depth):
        """Log and return a leaf predicting the most common label in ``classes``."""
        print(f"Liść: {depth}")
        print(classes)
        return Node(val=Counter(classes).most_common(1)[0][0])

    def build_tree(self, data, classes, depth=0, feature_indices=None):
        """Recursively grow the subtree for the given samples.

        Args:
            data: 2-D array of the features still available at this node.
            classes: Label of each row of ``data``.
            depth: Depth of the node being built.
            feature_indices: Original column index of each column of ``data``;
                defaults to ``0 .. data.shape[1] - 1`` (i.e. ``data`` is the
                full feature matrix).

        Returns:
            The root Node of the subtree.
        """
        if feature_indices is None:
            feature_indices = range(data.shape[1])
        feature_indices = list(feature_indices)

        if depth == self.max_depth or len(set(classes)) == 1:
            return self._make_leaf(classes, depth)

        best_argument, best_split, best_gain = self._find_best_split(data, classes)
        if best_argument is None:
            # No feature left, or none that separates the samples: stop here
            # instead of recursing into an empty child.
            return self._make_leaf(classes, depth)

        print("Krawędź")
        print(f"Depth: {depth}\t Best argument: {best_argument}\tBest gain: {best_gain}\tBest split: {best_split}")

        # split() removes the tested column, so the children see one column
        # less; drop the same entry from the index map to stay aligned.
        child_a_data, child_b_data = split(data, classes, best_argument, best_split)
        remaining = feature_indices[:best_argument] + feature_indices[best_argument + 1:]

        child_a_node = self.build_tree(child_a_data[:, :-1], child_a_data[:, -1], depth + 1, remaining)
        child_b_node = self.build_tree(child_b_data[:, :-1], child_b_data[:, -1], depth + 1, remaining)

        # Store the ORIGINAL column index: Node.predict indexes full samples,
        # not the reduced matrix this node was trained on.
        return Node(
            split_feature=feature_indices[best_argument],
            split_val=best_split,
            depth=depth,
            child_node_a=child_a_node,
            child_node_b=child_b_node,
        )

    def fit(self, data, classes):
        """Build the tree from a 2-D feature matrix and its labels.

        Raises:
            ValueError: If the training set is empty.
        """
        data = np.asarray(data)
        classes = np.asarray(classes)
        if len(classes) == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")
        self.tree = self.build_tree(data, classes)

    def predict(self, data):
        """Return the predicted label for a single sample (1-D feature vector)."""
        return self.tree.predict(data)


In [220]:
# Train a depth-4 tree and measure its accuracy on the held-out samples.
dc = DecisionTreeClassifier(4)
dc.fit(x_train, y_train)

# Predict once and reuse the result for both the score and the printout.
predictions = [dc.predict(sample) for sample in x_test]
good = sum(int(prediction == gt) for prediction, gt in zip(predictions, y_test))
samples = len(y_test)

accuracy = good / samples
# `.2%` scales the fraction by 100 before appending the percent sign.
print(f"Accuracy: {accuracy:.2%}")
print(predictions)

Krawędź
Depth: 0	 Best argument: 3	Best gain: 0.6415255847742902	Best split: 1.0
Krawędź
Depth: 1	 Best argument: 2	Best gain: 0.4778281371342904	Best split: 4.9
Krawędź
Depth: 2	 Best argument: 1	Best gain: 0.023682627817320306	Best split: 3.2
Liść: 3
[2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
Krawędź
Depth: 3	 Best argument: 0	Best gain: 0.024469005380044817	Best split: 7.1
Liść: 4
[2. 2. 2. 2. 2. 2. 2.]
Liść: 4
[2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 2. 1. 2. 2. 2. 2. 2. 2.
 1.]
Krawędź
Depth: 2	 Best argument: 0	Best gain: 0.04599913566448613	Best split: 5.0
Krawędź
Depth: 3	 Best argument: 0	Best gain: 0.015659677412709155	Best split: 2.9
Liść: 4
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Liść: 4
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 2.]
Krawędź
Depth: 3	 Best argument: 0	Best gain: 0.6931471805599453	Best split: 2.5
Liść: 4
[2.]
Liść: 4
[1.]
Liść: 1
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 

In [208]:
# Spot-check: predicted class for the first held-out sample.
dc.predict(x_test[0])

2.0

In [209]:
# Ground-truth label of the first held-out sample, to compare with the prediction.
y_test[0]

1