Todo List:
1. Handling Missing Values
2. Post-Pruning and Pre-Pruning
3. Prediction Method

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log 

In [2]:
# Create test data
def create_data():
    datasets = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否'],
               ]
    labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
    return datasets, labels

datasets, labels = create_data()
train_data = pd.DataFrame(datasets, columns=labels)
train_data

Unnamed: 0,年龄,有工作,有自己的房子,信贷情况,类别
0,青年,否,否,一般,否
1,青年,否,否,好,否
2,青年,是,否,好,是
3,青年,是,是,一般,是
4,青年,否,否,一般,否
5,中年,否,否,一般,否
6,中年,否,否,好,否
7,中年,是,是,好,是
8,中年,否,是,非常好,是
9,中年,否,是,非常好,是


In [3]:
train_data.有自己的房子 = train_data.有自己的房子.map(lambda x: '有房子' if x == '是' else '没有自己的房子')
train_data.有工作 = train_data.有工作.map(lambda x: '有工作' if x == '是' else '没有工作') 
train_data.rename({'有工作': '工作情况', '有自己的房子': '住房情况'})
train_data['deposit'] = [10, 12, 100, 101, 90, 81, 82, 32, 91, 91, 80, 80, 90, 20, 30]
train_data = train_data.reindex(columns=['年龄', '有工作', '有自己的房子', '信贷情况', 'deposit', '类别'])

### CART Algorithm for Decision Tree Generation

1. Termination Condition:
    1. gini coefficient is smaller than the threshold
    2. sample size is smaller than the threshold
    3. tree depth outnumbers the threshold
    4. Feature list is null
    5. Current datasets only contains one category
2. For each node:
    1. For each feature:
        1. For each feature class (Be carful of handling the continous variable):
        Compute the gini coefficient of the label:
        $$Gini(D, A) = \cfrac{|D_1|}{|D|}Gini(D_1) +  \cfrac{|D_2|}{|D|}Gini(D_2)$$
    Choose the class and the feature with the smallest gini coefficient(meaning the biggest purity) and bisplit the current node
3. Recurse the procedure above, until one of the termination conditions is satisfied

In [4]:
class DTNode:
    # ? How to prune the Decision Tree from bottom to top
    # ? Current thread is to through posterior order thus we need child_node to store all children
    def __init__(self, root=None, label=None, feature_name=None, feature_class=None, name=None):
        """Define DTNode Class

        Args:
            root (bool, optional): Whether this is the root node. Defaults to None.
            label (str, optional): If this is the leaf node, the this denotes the label of this decision path. Defaults to None.
            feature_name (str, optional): Name of the feature of this node. Defaults to None.
            feature_class(str, optional): Class of the feature which is served as the split point of the current node
            name(str, optional): Name of the node, e.g. ">10" "belong to old people"
        """
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature_class = feature_class
        self.name = name
        self.child_node = {} # ! This stores all the child nodes. In order to realize the mapping, the dictionary should be like {'feature_class': child_node}
          
    def predict(self, features):
        
        if self.root:
            return self.label
        return self.child_node[features[self.feature_id]].predict(features) # TODO: This should be changed, since CART algorithm is bisect

In [5]:
class DT:
    # TODO Get the Current Depth of the Decision Tree
    
    def __init__(self, episilon=0.1, alpha=0.1, max_depth=1, min_size=10):
        """Initial DT class

        Args:
            episilon (float, optional): threshold for gini coefficient. Defaults to 0.1.
            alpha (float, optional): hyperparameter for post pruning. Defaults to 0.1.
            max_depth (int, optional): threshold for max tree depth. Defaults to 10.
            min_size (int, optional): threshold for minimum sample size. Defaults to 2.
        """
        self.episilon = episilon
        self.alpha = alpha
        self.max_depth = max_depth
        self.min_size = min_size
        self.depth = 0 # This is used for recording tree depth
        self._tree = {}
    
    def calc_gini(self, train_data):
        
        datasets = train_data.values.tolist()
        label_count = {}
        data_length = len(datasets)
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        return 1 - sum([(val / data_length) ** 2 for val in label_count.values()])
        
    def gini_train(self, train_data):
        
        data_length = len(train_data)
        feature_list = train_data.columns[: -1]
        min_gini_feature_class = (float('inf'), feature_list[0], train_data[feature_list[0]].iloc[0]) # ! (current_min_gini, feature_name, feature_class)
        for feature in feature_list:
            if isinstance(train_data.iloc[0][feature], str): # Note the index grammer here
                # get all classes for this feature
                class_list = train_data[feature].value_counts().index.tolist()
                for feature_class in class_list:
                    # Calculate the gini coefficient of the dataset
                    D1 = train_data[train_data[feature] == feature_class]
                    D2 = train_data[train_data[feature] != feature_class]
                    gini = len(D1) / data_length * self.calc_gini(D1) + len(D2) / data_length * self.calc_gini(D2)
                    if gini <= min_gini_feature_class[0]:
                        min_gini_feature_class = (gini, feature, feature_class)
            if isinstance(train_data.iloc[0][feature], (int, float)):
                # sort the dataset according to the feature
                train_data = train_data.sort_values(by=[feature])
                for i in range(data_length - 1):
                    D1 = train_data.iloc[: i + 1, :]
                    D2 = train_data.iloc[i + 1:, :]
                    gini = len(D1) / data_length * self.calc_gini(D1) + len(D2) / data_length * self.calc_gini(D2)
                    if gini <= min_gini_feature_class[0]:
                        min_gini_feature_class = (gini, feature, (train_data.iloc[i][feature] + train_data.iloc[i + 1][feature]) / 2)
        return min_gini_feature_class
    
    def get_depth(self, TreeNode: DTNode):
        
        if not TreeNode:
            return 0
        if not TreeNode.child_node.values():
            return 1
        max_depth = 0
        for child_node in TreeNode.child_node.values():
            max_depth = max(max_depth, self.get_depth(child_node) + 1)
        return max_depth
    
    def train(self, train_data):
        
        y_train = train_data.iloc[:, -1]
        features = train_data.columns[: -1]
        
        # * Be aware of the order of the termination conditions
        # ! if there is no other feature, meaning the dataset only contains the label
        if len(features) == 0:
            return DTNode(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])
        
        # ! if the sample size is smaller than the given threshold
        if len(train_data) <= self.min_size:
            return DTNode(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0]) # ! Note here if train_data is null then y_train is null and will cause errors: Keyerror
        
        # ! if y_train only has one category, terminate
        if len(y_train.value_counts()) == 1:
            return DTNode(root=True, label=y_train.iloc[0])
        
        # ! if the min gini coeffcient is smaller than the threshold, meaning that we reach a very pure dataset
        min_gini, feature_name, feature_class = self.gini_train(train_data)
        if min_gini <= self.episilon:
            return DTNode(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])
        
        TreeNode = DTNode(root=False, feature_name=feature_name, feature_class=feature_class)
        
        # ! if the depth of the tree outnumbers the threshold
        if self.depth > self.max_depth:
            return DTNode(root=True, label=y_train.value_counts().sort_values(ascending=False).index[0])
        
        # * Bisect the current train_data
        # TODO Difference between 'float' and float in python
        
        # Every feature_name only choose one optimal feature_class as the split point
        # ! Add constraints to make sure that D1 and D2 are not null
        if isinstance(feature_class, str):
            D1 = train_data[train_data[feature_name] == feature_class].drop([feature_name], axis=1)
            D2 = train_data[train_data[feature_name] != feature_class].drop([feature_name], axis=1)
            if len(D1) != 0:
                TreeNode.child_node[feature_class] = self.train(D1)
                    # Add name to child nodes
                TreeNode.child_node[feature_class].name = TreeNode.feature_name + ' is ' + feature_class
            if len(D2) != 0:
                TreeNode.child_node['other'] = self.train(D2)
                TreeNode.child_node['other'].name = TreeNode.feature_name + ' not ' + feature_class
        
        if isinstance(feature_class, (int, float)):
            D1 = train_data[train_data[feature_name] <= feature_class].drop([feature_name], axis=1)
            D2 = train_data[train_data[feature_name] > feature_class].drop([feature_name], axis=1)
            if len(D1) != 0:
                TreeNode.child_node[str(feature_class)] = self.train(D1) # Note here we convert feature_class to str to make it callable in the dictionary
                TreeNode.child_node[str(feature_class)].name = TreeNode.feature_name + ' 小于等于 ' + str(feature_class)
            if len(D2) != 0:
                TreeNode.child_node['other'] = self.train(D2)
                TreeNode.child_node['other'].name = TreeNode.feature_name + ' 大于 ' + str(feature_class)
         
        return TreeNode
    
    def show_tree(self, TreeNode: DTNode): # ! print all paths of a binary tree (leetcode257)
                
        if not TreeNode:
            return None
        paths = []
        
        def print_path(TreeNode: DTNode, path):
            
            if not TreeNode.child_node:
                path += (TreeNode.name if TreeNode.name else "") + "----->" + (str(TreeNode.label) if TreeNode.label else "")
                paths.append(path)
                return None
            elif not TreeNode.name:
                path += TreeNode.feature_name + "----->"
            elif not TreeNode.root:
                path += TreeNode.name + "----->"
            for child_node in TreeNode.child_node.values():
                print_path(child_node, path)
                
        print_path(TreeNode, "")
        for path in paths:
            print(path)
            print("\n")
            
    def fit(self, train_data):
        
        self._tree = self.train(train_data)
        
        return self.show_tree(self._tree)

In [6]:
dt = DT()
dt.fit(train_data)

有自己的房子----->有自己的房子 is 有房子----->是


有自己的房子----->有自己的房子 not 有房子----->否




In [7]:
data = load_iris()
train_data = pd.concat([pd.DataFrame(data['data'], columns=data['feature_names']), pd.DataFrame(data['target'], columns=['target'])], axis=1)
train_data.target = train_data.target.replace(dict(zip(train_data.target.unique().tolist(), data['target_names'].tolist())))

In [8]:
dt = DT()
result = dt.train(train_data)
dt.fit(train_data)

petal width (cm)----->petal width (cm) 小于等于 0.8----->setosa


petal width (cm)----->petal width (cm) 大于 0.8----->petal length (cm) 小于等于 4.9----->sepal length (cm) 小于等于 4.9----->versicolor


petal width (cm)----->petal width (cm) 大于 0.8----->petal length (cm) 小于等于 4.9----->sepal length (cm) 大于 4.9----->sepal width (cm) 小于等于 2.7----->versicolor


petal width (cm)----->petal width (cm) 大于 0.8----->petal length (cm) 小于等于 4.9----->sepal length (cm) 大于 4.9----->sepal width (cm) 大于 2.7----->versicolor


petal width (cm)----->petal width (cm) 大于 0.8----->petal length (cm) 大于 4.9----->virginica




In [9]:
# Ultimate Test
train_data = pd.read_csv("/Users/qiaoxinwei/Downloads/archive/train.csv")
train_data[['blue', 'dual_sim', 'fc', 'four_g', 'n_cores', 'price_range', 'wifi', 'touch_screen', 'three_g']] = train_data[['blue', 'dual_sim', 'fc', 'four_g', 'n_cores', 'price_range', 'wifi', 'touch_screen', 'three_g']].astype(object)
train_data[['blue', 'dual_sim', 'fc', 'four_g', 'n_cores', 'price_range', 'wifi', 'touch_screen', 'three_g']] = train_data[['blue', 'dual_sim', 'fc', 'four_g', 'n_cores', 'price_range', 'wifi', 'touch_screen', 'three_g']].applymap(str)

In [10]:
dt = DT()
dt.fit(train_data)

n_cores----->n_cores is 1----->fc is 11----->0


n_cores----->n_cores is 1----->fc not 11----->m_dep 小于等于 0.1----->clock_speed 小于等于 1.15----->0


n_cores----->n_cores is 1----->fc not 11----->m_dep 小于等于 0.1----->clock_speed 大于 1.15----->blue is 0----->touch_screen is 1----->1


n_cores----->n_cores is 1----->fc not 11----->m_dep 小于等于 0.1----->clock_speed 大于 1.15----->blue is 0----->touch_screen not 1----->2


n_cores----->n_cores is 1----->fc not 11----->m_dep 小于等于 0.1----->clock_speed 大于 1.15----->blue not 0----->wifi is 0----->0


n_cores----->n_cores is 1----->fc not 11----->m_dep 小于等于 0.1----->clock_speed 大于 1.15----->blue not 0----->wifi not 0----->3


n_cores----->n_cores is 1----->fc not 11----->m_dep 大于 0.1----->clock_speed 小于等于 0.5----->wifi is 1----->touch_screen is 1----->1


n_cores----->n_cores is 1----->fc not 11----->m_dep 大于 0.1----->clock_speed 小于等于 0.5----->wifi is 1----->touch_screen not 1----->2


n_cores----->n_cores is 1----->fc not 11----->m_dep 大于 0.1----->clock