In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log 

In [2]:
# Create test data
def create_data():
    """Return the toy loan-application table and its column names.

    Returns:
        tuple: ``(rows, column_names)`` where ``rows`` is a list of 15
        five-element lists (four categorical features followed by the class
        label) and ``column_names`` is the matching list of five headers.
    """
    # Remaining four columns (job, house, credit, class) grouped by age bracket;
    # dict order fixes the row order: 青年 first, then 中年, then 老年.
    records_by_age = {
        '青年': [
            ('否', '否', '一般', '否'),
            ('否', '否', '好', '否'),
            ('是', '否', '好', '是'),
            ('是', '是', '一般', '是'),
            ('否', '否', '一般', '否'),
        ],
        '中年': [
            ('否', '否', '一般', '否'),
            ('否', '否', '好', '否'),
            ('是', '是', '好', '是'),
            ('否', '是', '非常好', '是'),
            ('否', '是', '非常好', '是'),
        ],
        '老年': [
            ('否', '是', '非常好', '是'),
            ('否', '是', '好', '是'),
            ('是', '否', '好', '是'),
            ('是', '否', '非常好', '是'),
            ('否', '否', '一般', '否'),
        ],
    }
    rows = [[age, *rest]
            for age, group in records_by_age.items()
            for rest in group]
    column_names = ['年龄', '有工作', '有自己的房子', '信贷情况', '类别']
    return rows, column_names

# Materialise the toy table as a DataFrame; the last column is the class label.
datasets, labels = create_data()
train_data = pd.DataFrame(data=datasets, columns=labels)
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


### ID3 Algorithm for Decision Tree Generation
Basic elements: **TreeNode** (implemented below as the `Node` class) / **Tree** (implemented below as the `DTree` class)

**TreeNode**:

Property:
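1. root: Whether this node is a terminal (leaf) node. Despite the name, `root=True` marks a node that ends the recursion and returns its label; internal split nodes have `root=False`
2. label: The class label returned by this node when `root=True` (`None` for internal nodes)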
3. feature: The id of the feature in this node 
4. feature_name: The name of the feature in this node
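5. tree: A dict that maps each value of this node's feature to the child node (sub-tree) for that value

Note: for example, "Car type" is a *feature name*, while "Tesla" and "Apple" are *feature values* of it — these values are the keys of `tree`.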

Method:
1. add_node: attach a child node under a given feature value, like right_tree / left_tree in a binary tree, except that the number of children equals the number of distinct values this node's feature takes
2. predict: predict the label recursively. If the current node is a leaf (`root=True`), return its label; otherwise look up the sample's value for this node's feature, follow the child that *add_node* registered for that value, and predict from there.


**Tree**:

Property:
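1. episilon: (i.e. epsilon) the information-gain threshold; when the best available gain is below it, the recursion stops and a leaf is created
2. self._tree: the root node of the fitted tree; it is returned by *fit* (so it can be printed) and used by *predict*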

Method:
1. train
2. fit
3. predict

In [3]:
class Node:
    """A single vertex of the decision tree.

    Attributes:
        root: ``True`` for a terminal node that answers with ``label``;
            ``False`` for an internal node that splits on ``feature``.
        label: Class label returned by a terminal node.
        feature_name: Column name of the feature this node splits on.
        feature: Position of that feature inside the sample passed to
            ``predict``.
        tree: Dict mapping each feature value to the child ``Node``.
        result: Dict with keys ``label``/``feature``/``tree`` used for display.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root, self.label = root, label
        self.feature_name, self.feature = feature_name, feature
        self.tree = {}
        # Display snapshot; it holds the very same ``tree`` dict, so children
        # attached later through add_node() appear in the repr as well.
        self.result = dict(label=label, feature=feature, tree=self.tree)

    def __repr__(self):
        return str(self.result)

    def add_node(self, val, node):
        """Register ``node`` as the child reached when the feature equals ``val``."""
        self.tree[val] = node  # ``tree`` is a dict: feature value -> child Node

    def predict(self, features):
        """Return the label for ``features`` by descending from this node.

        ``features`` is indexed with ``self.feature``; a value that has no
        registered child raises ``KeyError``.
        """
        if self.root is not True:
            child = self.tree[features[self.feature]]
            return child.predict(features)
        return self.label

In [4]:
class DTree:
    """ID3 decision tree for categorical features.

    The tree is grown greedily: every internal node splits on the feature with
    the largest information gain, and a branch stops growing when that gain is
    below ``episilon``, when only one class is left, or when no features remain.
    """

    def __init__(self, episilon=0.1):
        # Minimum information gain required to split (acts as pre-pruning).
        self.episilon = episilon
        # Root Node of the fitted tree; stays an empty dict until fit() runs.
        self._tree = {}

    @staticmethod
    def calc_ent(datasets):
        """Return the empirical entropy, in bits, of the class labels.

        Args:
            datasets: Sequence of rows; the last element of each row is the
                class label.
        """
        data_length = len(datasets)
        label_count = Counter(row[-1] for row in datasets)
        return -sum((n / data_length) * log(n / data_length, 2)
                    for n in label_count.values())

    def cond_ent(self, datasets, axis=0):
        """Return the conditional entropy of the labels given column ``axis``."""
        data_length = len(datasets)
        # Partition the rows by the value they take in the chosen column.
        feature_sets = {}
        for row in datasets:
            feature_sets.setdefault(row[axis], []).append(row)
        return sum((len(subset) / data_length) * self.calc_ent(subset)
                   for subset in feature_sets.values())

    @staticmethod
    def info_gain(ent, cond_ent):
        """Return the information gain ``ent - cond_ent``."""
        return ent - cond_ent

    def info_gain_train(self, datasets):
        """Return ``(column_index, gain)`` for the feature with the largest gain.

        ``column_index`` is relative to ``datasets`` as passed in; ties go to
        the lowest index.
        """
        count = len(datasets[0]) - 1  # last column holds the label
        ent = self.calc_ent(datasets)
        gains = [(c, self.info_gain(ent, self.cond_ent(datasets, axis=c)))
                 for c in range(count)]
        return max(gains, key=lambda x: x[-1])

    def train(self, train_data, feature_positions=None):
        """Recursively build the (sub-)tree for ``train_data``.

        Args:
            train_data: DataFrame whose last column is the class label and
                whose other columns are categorical features.
            feature_positions: Optional dict mapping each feature column name
                to its position in the samples later given to ``predict``.
                Defaults to the column order of ``train_data``; the recursion
                passes the top-level mapping down unchanged.

        Returns:
            Node: root of the tree grown from ``train_data``.
        """
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[:-1]

        # Each recursive call works on a frame from which the already-used
        # column was dropped, so an index into *that* frame does not line up
        # with the full-length sample Node.predict() indexes. Remember where
        # every column sat in the original frame and store that position.
        if feature_positions is None:
            feature_positions = {name: pos for pos, name in enumerate(features)}

        # Only one class left: emit a leaf carrying it.
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])

        # No features left to split on: emit a leaf with the majority class.
        if len(features) == 0:
            return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])

        local_index, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[local_index]

        # Best gain below the threshold: stop here with the majority class.
        if max_info_gain < self.episilon:
            return Node(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])

        node_tree = Node(root=False,
                         feature_name=max_feature_name,
                         feature=feature_positions[max_feature_name])
        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop([max_feature_name], axis=1)
            node_tree.add_node(f, self.train(sub_train_df, feature_positions))
        return node_tree

    def fit(self, train_data):
        """Grow the tree from ``train_data``, store it on the instance and return its root."""
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        """Return the predicted label for one sample.

        ``X_test`` lists the feature values in the column order of the frame
        given to ``fit``.
        """
        return self._tree.predict(X_test)
        

In [5]:
# Train the hand-written ID3 decision tree on the toy loan data.
dt = DTree()

# Relabel the two yes/no columns with self-describing values. The dict lookup
# falls back to the current value, so re-running this cell is harmless; the
# earlier `x == '是'` test sent every already-relabelled value to the "no" label.
house_labels = {'是': '有房子', '否': '没有自己的房子'}
job_labels = {'是': '有工作', '否': '没有工作'}
train_data['有自己的房子'] = train_data['有自己的房子'].map(lambda x: house_labels.get(x, x))
train_data['有工作'] = train_data['有工作'].map(lambda x: job_labels.get(x, x))

# `rename` returns a new frame and needs `columns=` to target column labels;
# the earlier bare call renamed nothing. Fit on the renamed view and leave
# `train_data` itself with its original column names.
tree = dt.fit(train_data.rename(columns={'有工作': '工作情况', '有自己的房子': '住房情况'}))
tree

{'label': None, 'feature': 2, 'tree': {'没有自己的房子': {'label': None, 'feature': 1, 'tree': {'没有工作': {'label': '否', 'feature': None, 'tree': {}}, '有工作': {'label': '是', 'feature': None, 'tree': {}}}}, '有房子': {'label': '是', 'feature': None, 'tree': {}}}}

In [6]:
# Classify one unseen applicant; values are positional, in training-column order.
dt.predict([
    '老年',    # 年龄
    '有工作',  # 有工作
    '有房子',  # 有自己的房子
    '一般',    # 信贷情况
])

'是'