Todo List:
1. Handling Missing Values
2. Post-Pruning & Pre-Pruning

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log 

In [8]:
# Create test data
def create_data():
    datasets = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否'],
               ]
    labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
    return datasets, labels

datasets, labels = create_data()
train_data = pd.DataFrame(datasets, columns=labels)
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


### C4.5 Algorithm for Decision Tree Generation

Basic difference between C4.5 and ID3:
1. Using information gain ratio rather than information gain 
2. Handling training data with missing values
3. Handling training data with continous values
4. Post-prune trees after creation

In [9]:
# Define TreeNode
class DTNode:
    
    def __init__(self, root=False, label=None, feature_name=None, feature=None):
        self.root = root # whether a rootnode or not
        self.label = label # if this is the root node, then the final label(y_train)
        self.feature = feature # feature id
        self.feature_name = feature_name # feature name
        self.tree = {} # structure of the tree
        self.result = {
            'label': self.label,
            'feature': self.feature,
            'tree': self.tree
        }

    def __repr__(self):
        
        """ make the class object callable
        
        show the structure of the node
        
        """
        return '{}'.format(self.result) 
    
    def add_node(self, val, node):
        
        """ Add connections between the root node and its offspring

        Args:
            val (str): The class of the feature e.g: "car type": ["Tesla", "Apple"] feature -> car type val -> Tesla
            node (DTNode): The node represent the subtree e.g: the structure of the root node after selecting "Tesla" in feature "car type"
        """
        self.tree[val] =  node
    
    def predict(self, features):
        """ predict the result for a single sample

        Args:
            features (list or array): the sample

        Understand the process of conducting this prediction:
            First we judge whether this is the root node. If it is, the we output its label. If not, we firstly detect the feature id of this node, since
            this node uses the i-th feature to make decision and in the end we move to the corresponding subtree.
        """
        if self.root:
            return self.label
        return self.tree[features[self.feature]].predict(features)       

In [10]:
# Define DT
class DT:
    
    def __init__(self, episilon=0.1): # episilon defines the threshold
        self.episilon = episilon
        self._tree = {}
        
    def calc_ent(self, datasets):
        data_length = len(datasets)
        data_label = {}
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in data_label:
                data_label[label] = 0
            data_label[label] += 1
        return -sum([val / data_length * math.log(val / data_length, 2) for val in data_label.values()])
    
    def cond_ent(self, datasets, axis=0):
        data_length = len(datasets)
        feature_list = {}
        for i in range(data_length):
            feature_class = datasets[i][axis]
            if feature_class not in feature_list:
                feature_list[feature_class] = []
            feature_list[feature_class].append(datasets[i][axis])
        return sum([len(subdata) / data_length * self.calc_ent(subdata) for subdata in feature_list.values()])
    
    def info_gain_ratio(self, datasets, axis=0):
        return (self.calc_ent(datasets) - self.cond_ent(datasets, axis)) / self.calc_ent(datasets)
    
    def info_gain_train(self, datasets):
        max_info_gain_ratio = (0, 0) # (feature[id], info_gain_ratio)
        for i in range(len(datasets[0]) - 1):
            info_gain_r = self.info_gain_ratio(datasets, i)
            if info_gain_r >= max_info_gain_ratio[1]:
                max_info_gain_ratio = (i, info_gain_r)
        return max_info_gain_ratio
    
    def train(self, train_data):
        _, y_train, features = train_data.iloc[:, : -1], train_data.iloc[:, -1], train_data.columns[: -1]
        
        #! if there is only one category
        if len(y_train.value_counts()) == 1:
            return DTNode(root=True, label=y_train.iloc[0])        

        #! if the dataset is null, then the most frequent labels should be the result
        if len(features) == 0:
            return DTNode(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])
        
        #! If the max_info_ratio doesn't outnumber the threshold
        max_feature_id, max_info_gain_ratio = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature_id]
        
        if max_info_gain_ratio < self.episilon:
            return DTNode(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])
        
        # If none of the situations above happens, then we establish the current DTNode with property root = False and determine the feature
        RootNode = DTNode(root=False, feature_name=max_feature_name, feature=max_feature_id)
        feature_val_list = train_data.iloc[:, max_feature_id].value_counts().index
        for val in feature_val_list:
            subtree_df = train_data.loc[train_data[max_feature_name] == val].drop(max_feature_name, axis=1)
            subtree = self.train(subtree_df)
            RootNode.add_node(val, subtree)
            
        return RootNode
    
    def fit(self, train_data):
        self._tree = self.train(train_data)
        return self._tree
        
    def predict(self, X_test):
        return self._tree.predict(X_test)
    
    def score(self, X_test, y_test):
        right = 0
        for i in range(len(X_test)):
            if y_test.iloc[i] == self.predict(X_test.iloc[i, :].tolist()):
                right += 1
        return right / len(X_test)

In [11]:
train_data.有自己的房子 = train_data.有自己的房子.map(lambda x: '有房子' if x == '是' else '没有自己的房子')
train_data.有工作 = train_data.有工作.map(lambda x: '有工作' if x == '是' else '没有工作') 
train_data.rename({'有工作': '工作情况', '有自己的房子': '住房情况'})

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,没有工作,没有自己的房子,一般,否
1,青年,没有工作,没有自己的房子,好,否
2,青年,有工作,没有自己的房子,好,是
3,青年,有工作,有房子,一般,是
4,青年,没有工作,没有自己的房子,一般,否
5,中年,没有工作,没有自己的房子,一般,否
6,中年,没有工作,没有自己的房子,好,否
7,中年,有工作,有房子,好,是
8,中年,没有工作,有房子,非常好,是
9,中年,没有工作,有房子,非常好,是


In [12]:
dt = DT()
tree = dt.fit(train_data)
tree.predict(['老年', '有工作', '有房子', '一般'])

'是'