In [27]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [92]:
def entropy(y):
    """
    Shannon entropy (base 2) of the label distribution in ``y``.

    For every distinct value in ``y`` the share ``p`` of rows carrying that
    value is computed, and ``p * log2(p)`` is subtracted from a running
    total that starts at 0:  H(y) = -sum_k p_k * log2(p_k).

    y: pandas Series (anything offering ``unique()``, ``len`` and boolean
       mask indexing).
    Returns the accumulated entropy; 0 when ``y`` has no rows.
    """
    total_rows = len(y)
    accumulated = 0

    for label in y.unique():
        # Fraction of rows that carry this label.
        share = len(y[y == label]) / total_rows
        accumulated -= share * np.log2(share)
    return accumulated

def gain_ratio(X, y):
    """
    Score how much splitting ``y`` by the values of ``X`` reduces entropy.

    Starts from ``entropy(y)`` and, for each distinct value of ``X``,
    subtracts the entropy of the matching labels weighted by the share of
    rows that take that value.

    NOTE: despite the name, the result is NOT divided by the split
    information (what ``iv`` computes), so the value returned here is the
    plain information gain.

    X: pandas Series holding one feature column.
    y: pandas Series of labels, index-aligned with ``X``.
    """
    n_rows = len(y)
    remaining = entropy(y)
    for value in X.unique():
        # Labels of the rows whose feature equals this value.
        branch_labels = y[X == value]
        remaining -= len(branch_labels) / n_rows * entropy(branch_labels)
    return remaining

def iv(X):
    """
    Intrinsic value (split information) of a feature column.

    Computes -sum_v p_v * log2(p_v), where p_v is the count reported by
    ``X.value_counts()`` for value v divided by ``len(X)``.

    X: pandas Series holding one feature column.
    Returns the accumulated value; 0 when ``X`` has no rows.
    """
    n_rows = len(X)
    split_info = 0
    for count in X.value_counts():
        fraction = count / n_rows
        split_info -= fraction * np.log2(fraction)

    return split_info

def gini(X):
    """
    Gini impurity of the value distribution in ``X``.

    Computes 1 - sum_k p_k ** 2, where p_k is the share of (non-missing)
    rows that carry value k. 0.0 means every row has the same value; the
    result grows as the values become more evenly mixed.

    X: pandas Series of labels or categorical values.
    Returns a Python float; 0.0 when ``X`` has no countable rows.
    """
    counts = X.value_counts()
    total = counts.sum()
    if total == 0:
        # Nothing to measure: treat an empty sample as pure instead of
        # letting the formula report the maximum impurity of 1.
        return 0.0
    shares = counts / total
    return float(1.0 - (shares ** 2).sum())

class DecisionTreeNode(object):
    """
    One node of the decision tree.

    column:   name of the feature this node splits on; '' marks a leaf.
    children: dict mapping a feature value to the child node for that value.
    result:   value returned when prediction stops at this node.
    """

    def __init__(self, column='', children=None, result=None):
        self.column = column
        # Build a fresh dict per node. A literal ``{}`` default is created
        # once and would be shared by every node built without explicit
        # children, so adding a child to one node would add it to all.
        self.children = {} if children is None else children
        self.result = result

    def predict(self, X):
        """
        Walk down from this node for a single sample.

        X: one sample, indexable by column name (dict, pandas row, ...).

        Returns ``self.result`` at a leaf, -1 when the node names a split
        column but has no children, the matching child's prediction when
        the sample's value has a branch, and ``self.result`` as a fallback
        when that value has no branch.
        """
        if self.column == '':
            return self.result

        if not self.children:
            return -1

        value = X[self.column]
        # Plain membership test: the previous ``len(...) > 0 & ...`` parsed
        # as ``len(...) > (0 & ...)`` and never checked the key, so unseen
        # values raised KeyError instead of reaching the fallback.
        if value in self.children:
            return self.children[value].predict(X)
        return self.result


class DecisionTree(object):
    """
    Decision tree classifier for categorical feature columns.

    ``fit`` grows a tree of ``DecisionTreeNode`` objects: at each node the
    column scored highest by ``gain_ratio`` is chosen and one child is
    created per distinct value of that column. ``predict`` walks the tree.
    """

    def __init__(self, information_method='entropy'):
        # Stored for callers that set or read it; the split search in
        # _build_children always scores columns with gain_ratio().
        self.information_method = information_method
        # Placeholder leaf (result None) until fit() replaces it. The dict
        # is passed explicitly so this node never shares a children mapping.
        self.tree = DecisionTreeNode(column='', children={}, result=None)

    def fit(self, X, y):
        """
        Build the tree from training data.

        X: pandas DataFrame, one categorical feature per column.
        y: labels, one per row of ``X``, matched by position.
        """
        self.tree = self._build_children(X, y)

    @staticmethod
    def _majority_label(labels):
        """Most frequent value in ``labels``, or None when there is none."""
        counts = labels.value_counts()
        return counts.index[0] if len(counts) > 0 else None

    def _build_children(self, X, y):
        """
        Grow the tree iteratively with an explicit stack and return its root.

        Each stack entry is (row positions, node to fill in). Rows are
        tracked as integer positions and read with ``iloc``, so the tree
        does not depend on what the index labels of ``X`` happen to be.
        """
        columns = X.columns
        # Give the labels the same index as X: gain_ratio() masks the labels
        # with a boolean Series built from a feature column, which needs
        # both to be index-aligned.
        labels = pd.Series(np.asarray(y), index=X.index)

        root = DecisionTreeNode(column='', children={}, result=None)
        stack = [(np.arange(len(X)), root)]

        while stack:
            positions, cur_node = stack.pop()

            # Pure subset: store the single label itself (not the array
            # returned by unique()) and stop splitting here.
            cur_y = labels.iloc[positions]
            y_unique = cur_y.unique()
            if len(y_unique) == 1:
                cur_node.result = y_unique[0]
                continue

            cur_x = X.iloc[positions]
            # Fallback answer for this node: used when no further split is
            # possible, or when prediction meets a value without a branch.
            cur_node.result = self._majority_label(cur_y)

            # Search for the best split. Columns with fewer than two
            # distinct values in this subset are skipped: splitting on one
            # would hand the same rows to a single child and never finish.
            best_column = None
            max_gain = -1
            for col in columns:
                if cur_x[col].nunique() < 2:
                    continue
                col_gain = gain_ratio(cur_x[col], cur_y)
                if col_gain > max_gain:
                    max_gain = col_gain
                    best_column = col

            if best_column is None:
                # Mixed labels but nothing left to split on: keep this node
                # as a leaf that answers with the majority label.
                continue

            # Generate one child per observed value of the chosen column.
            cur_node.column = best_column
            split_values = cur_x[best_column]
            for column_value in split_values.unique():
                child = DecisionTreeNode(column='', children={}, result=None)
                cur_node.children[column_value] = child
                in_branch = (split_values == column_value).values
                stack.append((positions[in_branch], child))

        return root

    def predict(self, X):
        """
        Predict with the fitted tree.

        X: a single sample indexable by column name (dict or DataFrame row),
           or a DataFrame of samples.

        Returns the prediction for a single sample, or a pandas Series with
        one prediction per row (sharing the DataFrame's index).
        """
        if isinstance(X, pd.DataFrame):
            row_predictions = [self.tree.predict(row) for _, row in X.iterrows()]
            return pd.Series(row_predictions, index=X.index)
        return self.tree.predict(X)

In [29]:
# Training table: 17 rows of integer-coded categorical features plus a 0/1
# label in the last column. Column names, in order, translate to:
# color, root, knock sound, texture, navel, touch, good melon (label).
data = pd.DataFrame(
    data=[
        [1, 1, 1, 1, 1, 1, 1],
        [2, 1, 2, 1, 1, 1, 1],
        [2, 1, 1, 1, 1, 1, 1],
        [1, 1, 2, 1, 1, 1, 1],
        [3, 1, 1, 1, 1, 1, 1],
        [1, 2, 1, 1, 2, 2, 1],
        [2, 2, 1, 2, 2, 2, 1],
        [2, 2, 1, 1, 2, 1, 1],
        [2, 2, 2, 2, 2, 1, 0],
        [1, 3, 3, 1, 3, 2, 0],
        [3, 3, 3, 3, 3, 1, 0],
        [3, 1, 1, 3, 3, 2, 0],
        [1, 2, 1, 2, 1, 1, 0],
        [3, 2, 2, 2, 1, 1, 0],
        [2, 2, 1, 1, 2, 2, 0],
        [3, 1, 1, 3, 3, 1, 0],
        [1, 1, 2, 2, 2, 1, 0],
    ],
    columns=['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '好瓜'],
)
# Every column except the last is a feature; the last one is the label.
X = data.loc[:, data.columns[:-1]]
y = data.loc[:, '好瓜']

In [85]:
# Create the hand-written tree with its default setting spelled out, then
# train it on the full feature table and label column.
decision_tree = DecisionTree(information_method='entropy')
decision_tree.fit(X=X, y=y)

In [93]:
# Print the intrinsic value (split information) of every feature column.
for feature_name, feature_values in X.items():
    print(feature_name, ": ", iv(feature_values))

色泽 :  1.5798634010685344
根蒂 :  1.402081402756032
敲声 :  1.3328204045850196
纹理 :  1.4466479595102752
脐部 :  1.548565226030918
触感 :  0.8739810481273578
