In [59]:
import numpy as np

# Decision Tree Node

In [139]:
class Node:
    """
    One node of a binary decision tree over 0/1 features with a 0/1 label.

    Every data row holds the feature values followed by the label in the last
    column. On construction the node measures the entropy of its own data and
    immediately picks the feature whose split gives the lowest weighted
    entropy; ``split()`` then creates the two child nodes.
    """
    # choose-able features in this node, 1 is choose-able
    features = []
    # optional list of display names, indexed by feature id
    feature_names = None
    # chosen feature id (-1: none chosen)
    feature_id = -1
    # chosen feature name
    feature_name = ''
    # the data this node has to make a decision on
    data = None
    # rows whose chosen feature is non-zero / zero
    data_left = None
    data_right = None
    # the entropy of self.data
    entrophy = 1
    # the weighted entropy of the split data
    entrophy_sub = 1
    # node depth (root is 1)
    depth = 1
    # node type 0: mid, 1: leaf
    node_type = 0
    # minimum information gain required to keep the node as a mid node
    threshold = 0.2
    # nodes at this depth are never split
    max_depth = 5
    # sub nodes
    node_left = None
    node_right = None
    # predicted label, for leaf nodes
    result = -1

    def __init__(self, data, features, feature_names=None, threshold=None,
                 max_depth=None, depth=1):
        """
        Build the node and choose its split feature right away.
        :param data: rows of 0/1 feature values with the label in the last column
        :param features: availability flags, 1 means the feature may be used
        :param feature_names: optional display names indexed by feature id
        :param threshold: optional minimum information gain; class default if None
        :param max_depth: optional depth limit; class default if None
        :param depth: depth of this node, the root being 1
        :raises ValueError: if ``data`` contains no rows
        """
        if data is None or len(data) == 0:
            raise ValueError("Node requires at least one data row")
        self.features = np.copy(features)
        self.data = data
        self.feature_names = feature_names
        # The stopping rule is evaluated inside choose_feature(), so these
        # settings have to be in place before it runs.
        if threshold is not None:
            self.threshold = threshold
        if max_depth is not None:
            self.max_depth = max_depth
        self.depth = depth
        self.calculate_self_entrophy()
        self.choose_feature()

    def H(self, x: float):
        """
        calculate the binary entropy of a ratio
        :param x: the positive ratio, between 0 and 1
        :return: the H(x), entropy of x in bits
        """
        # log2(0) is undefined; the limit of the entropy at both ends is 0
        if x == 0.0 or x == 1.0:
            return 0.0
        return - x * np.log2(x) - (1 - x) * np.log2(1 - x)

    def calculate_sub_entrophy(self, data_pos, data_neg) -> float:
        """
        calculate the weighted entropy of splitting by a feature
        :param data_pos: rows where the feature is set
        :param data_neg: rows where the feature is not set
        :return: size-weighted sum of the entropy of both parts
        """
        total_amount = len(data_pos) + len(data_neg)
        if total_amount == 0:
            return 0.0
        entrophy = 0.0
        for part in (data_pos, data_neg):
            amount = len(part)
            # an empty part has weight 0 and must not be divided by
            if amount == 0:
                continue
            positives = sum(1 for row in part if row[-1] == 1)
            entrophy += amount / total_amount * self.H(positives / amount)
        return entrophy

    def _majority_label(self):
        """
        Return the most frequent label (0 or 1) in self.data.
        A tie is resolved with the label of the first row.
        """
        total = len(self.data)
        positives = sum(1 for row in self.data if row[-1] == 1)
        if 2 * positives > total:
            return 1
        if 2 * positives < total:
            return 0
        return int(self.data[0][-1])

    def _make_leaf(self):
        """Turn this node into a leaf predicting the majority label."""
        self.node_type = 1
        self.result = self._majority_label()

    def _resolve_feature_name(self, feature_id):
        """
        Look up the display name of a feature.
        Uses self.feature_names when given, otherwise a module-level
        ``feature_name`` list if one exists, otherwise a generic name.
        """
        names = self.feature_names
        if names is None:
            names = globals().get('feature_name')
        if names is None:
            return f"feature {feature_id}"
        return names[feature_id]

    def choose_feature(self):
        """
        Choose the best feature to split the data in this node.
        :update:
                - self.features
                - self.feature_id
                - self.feature_name
                - self.entrophy_sub
                - self.data_left / self.data_right
                - self.node_type / self.result when the node becomes a leaf
        :return: None
        """
        if self.entrophy == 0:
            # all rows share one label, nothing left to decide
            self._make_leaf()
            return
        entrophy_min = 1
        chosen_feature_id = -1
        data_left = None
        data_right = None
        for feature_id, available in enumerate(self.features):
            # skip features already used on the path to this node
            if available != 1:
                continue
            data_pos = []
            data_neg = []
            for row in self.data:
                if row[feature_id] == 0:
                    data_neg.append(row)
                else:
                    data_pos.append(row)
            # a feature that is constant here separates nothing and would
            # produce an empty child
            if not data_pos or not data_neg:
                continue
            entrophy = self.calculate_sub_entrophy(data_pos, data_neg)
            if entrophy < entrophy_min:
                chosen_feature_id = feature_id
                entrophy_min = entrophy
                data_left = data_pos
                data_right = data_neg
        self.data_left = data_left
        self.data_right = data_right
        self.feature_id = chosen_feature_id
        self.entrophy_sub = entrophy_min
        if chosen_feature_id < 0:
            # no usable feature: leave self.features untouched and stop here
            self._make_leaf()
            return
        self.feature_name = self._resolve_feature_name(chosen_feature_id)
        self.features[chosen_feature_id] = 0
        information_gain = self.entrophy - entrophy_min
        if information_gain <= self.threshold:
            self._make_leaf()

    def calculate_self_entrophy(self) -> float:
        """
        calculate the entropy of the labels in self.data and store it
        :return: entropy
        """
        data_amount = len(self.data)
        pos_count = sum(1 for row in self.data if row[-1] == 1)
        self.entrophy = self.H(pos_count / data_amount)
        return self.entrophy

    def split(self):
        """
        Create the child nodes for the chosen feature.
        A leaf is not split. A mid node that has reached max_depth is turned
        into a leaf so that it still carries a prediction.
        :return: (node_left, node_right), both None when nothing was split
        """
        self.node_left = None
        self.node_right = None
        if self.node_type == 1:
            return self.node_left, self.node_right
        if self.depth >= self.max_depth:
            self._make_leaf()
            return self.node_left, self.node_right
        # settings go through the constructor because the children choose
        # their feature, and apply the threshold, while being constructed
        child_settings = dict(
            feature_names=self.feature_names,
            threshold=self.threshold,
            max_depth=self.max_depth,
            depth=self.depth + 1,
        )
        self.node_left = Node(self.data_left, self.features, **child_settings)
        self.node_right = Node(self.data_right, self.features, **child_settings)
        return self.node_left, self.node_right

    def __str__(self):
        if self.node_type == 0:
            return f"feature: {self.feature_name}"
        return f"result: {self.result}"

# DFS function

In [140]:
def dfs(node: "Node | None"):
    """
    Grow the tree below ``node`` depth-first by splitting every node reached.
    :param node: root of the subtree to expand; ``None`` stands for a missing
                 child and ends the recursion
    :return: None
    """
    # identity check: None is a singleton and must not go through __eq__
    if node is None:
        return
    node_left, node_right = node.split()
    dfs(node_left)
    dfs(node_right)

# Cat classification example

| Ear shape(Pointy) | Face shape(Round) | Whiskers(Present) | Cat  |
| ----------------- | ----------------- | ----------------- | ---- |
| 1                 | 1                 | 1                 | 1    |
| 0                 | 0                 | 1                 | 1    |
| 0                 | 1                 | 0                 | 0    |
| 1                 | 0                 | 1                 | 0    |
| 1                 | 1                 | 1                 | 1    |
| 1                 | 1                 | 0                 | 1    |
| 0                 | 0                 | 0                 | 0    |
| 1                 | 1                 | 0                 | 1    |
| 0                 | 1                 | 0                 | 0    |
| 0                 | 1                 | 0                 | 0    |



In [141]:
# One row per animal: ear shape (pointy), face shape (round),
# whiskers (present), and the label in the last column (1 = cat).
data = np.array(
    [
        (1, 1, 1, 1),
        (0, 0, 1, 1),
        (0, 1, 0, 0),
        (1, 0, 1, 0),
        (1, 1, 1, 1),
        (1, 1, 0, 1),
        (0, 0, 0, 0),
        (1, 1, 0, 1),
        (0, 1, 0, 0),
        (0, 1, 0, 0),
    ],
    dtype=int,
)
# Display names, in the same order as the feature columns of `data`.
feature_name = ["Ear shape", "Face shape", "Whiskers"]
# Availability flags, one per feature: 1 means the feature may still be chosen.
feature = [1] * len(feature_name)

In [149]:
# Build the root with all three features available. Constructing a Node
# already selects its split feature, so the settings assigned below do not
# influence the root's own choice; max_depth governs the later split() calls
# and both values are handed on to the child nodes by split().
root = Node(data, [1] * 3)
root.max_depth, root.threshold = 3, 0.05

In [150]:
# Expand the whole tree: dfs() calls split() on the root and, recursively,
# on every child node that split() returns.
dfs(root)

In [151]:
# Print the tree level by level; each of the first two levels is followed
# by a blank line, and every node is rendered through Node.__str__.
print(root, end=' \n\n')
print(root.node_left, root.node_right, sep='\n', end=' \n\n')
print(
    root.node_left.node_left,
    root.node_left.node_right,
    root.node_right.node_left,
    root.node_right.node_right,
    sep='\n',
)

feature: Ear shape 

feature: Face shape
feature: Whiskers 

result: 1
result: 0
result: 1
result: 0
