In [1]:
'''
特征选择算法
'''

import numpy as np
import math
def entropy(y_values):
    """Return the base-2 Shannon entropy of the values in ``y_values``.

    The probability of each distinct value is its number of occurrences
    divided by ``len(y_values)``. An input without any values yields 0.
    """
    total = len(y_values)
    probabilities = (
        np.sum(y_values == label) / total for label in np.unique(y_values)
    )
    weighted_logs = sum(prob * math.log(prob, 2) for prob in probabilities)
    return -1 * weighted_logs

def entropy_condition(x_values, y_values):
    """Return the information gain of ``y_values`` with respect to ``x_values``.

    Despite the function name, the value returned is ``H(Y) - H(Y|X)``: the
    entropy of the labels minus their conditional entropy given the feature.

    The two arguments are joined with ``np.hstack`` and then addressed as
    columns 0 (feature) and 1 (label), so each is expected to be a column
    vector of shape ``(n, 1)``.
    """
    label_entropy = entropy(y_values)
    paired = np.hstack((x_values, y_values))
    n_samples = len(x_values)

    conditional = 0
    for x_val in np.unique(x_values):
        # Weight of this feature value = share of samples that carry it.
        weight = np.sum(x_values == x_val) / n_samples
        matching_rows = paired[paired[:, 0] == x_val]
        conditional += weight * entropy(matching_rows[:, 1])
    return label_entropy - conditional

def entropy_condition_ratio(x_values, y_values):
    """Return the information gain ratio of ``y_values`` given ``x_values``.

    The gain ratio divides the information gain by the entropy of the
    feature itself (the split information), which removes the bias of plain
    information gain towards features with many distinct values.

    Returns 0.0 when the split information is zero, i.e. when the feature
    takes a single value (or there are no samples). Such a feature cannot
    separate the data, and dividing by zero would otherwise produce NaN (or
    raise ``ZeroDivisionError``) and corrupt the ranking of features.
    """
    split_info = entropy(x_values)
    if split_info == 0:
        return 0.0
    return entropy_condition(x_values, y_values) / split_info

def gini(y_values):
    """Return the Gini index of the values in ``y_values``.

    Computed as one minus the sum of squared probabilities, where the
    probability of each distinct value is its count over ``len(y_values)``.
    An input without any values yields 1.
    """
    total = len(y_values)
    shares = [np.sum(y_values == label) / total for label in np.unique(y_values)]
    return 1 - sum(share * share for share in shares)

def gini_condition(x_values, y_values):
    """Return the Gini index of a binary split on each value of the feature.

    For every distinct value ``v`` in ``x_values`` the samples are divided
    into those whose feature equals ``v`` and those whose feature differs.
    The score stored under ``v`` is the sum of the two groups' Gini indices,
    each weighted by the group's share of ``len(x_values)``.

    The two arguments are joined with ``np.hstack`` and then addressed as
    columns 0 (feature) and 1 (label), so each is expected to be a column
    vector of shape ``(n, 1)``.

    Returns a dict mapping each distinct feature value to its score.
    """
    paired = np.hstack((x_values, y_values))
    n_samples = len(x_values)

    def weighted_gini(rows):
        # Gini index of the rows' labels, scaled by the rows' share of samples.
        return len(rows) / n_samples * gini(rows[:, 1])

    scores = {}
    for x_val in np.unique(x_values):
        matches = paired[:, 0] == x_val
        scores[x_val] = weighted_gini(paired[matches]) + weighted_gini(paired[~matches])
    return scores



In [2]:
class DTNode:
    """A single node of the decision tree.

    A node stores the samples that reached it (``x`` features, ``y`` labels),
    the label it predicts, the feature value that led to it from its parent
    (``split_val``) and, once ``build_children`` has split it, the index of
    the feature it splits on together with its child nodes.
    """

    def __init__(self, x, y, default_label, split_val=None):
        """Create a node.

        x: 2-D array of features, one row per sample.
        y: labels as a column vector with one row per sample.
        default_label: label used when the node holds no samples.
        split_val: value of the parent's split feature shared by the samples
            of this node (``None`` for the root).
        """
        self.children = []
        if len(y) != 0:
            # Majority label among the samples of this node.
            self.label = Counter(y.reshape(1, -1).tolist()[0]).most_common(1)[0][0]
        else:
            self.label = default_label
        self.next_split_index = None
        self.split_val = split_val
        self.x = x.copy()
        self.y = y.copy()
        self.xy = np.hstack([x, y])
        self.default_label = default_label

    def get_x(self):
        return self.x

    def get_y(self):
        return self.y

    def get_xy(self):
        return self.xy

    def get_children(self):
        return self.children

    def get_label(self):
        return self.label

    def get_next_split_index(self):
        return self.next_split_index

    def get_split_val(self):
        return self.split_val

    def _get_x_and_xval(self, calculate_method, threshold):
        """Pick the feature with the highest score and list its values.

        ``calculate_method(feature_column, label_column)`` is called once per
        feature column, both arguments being column vectors. The feature
        with the largest score wins; ties go to the lowest column index.

        Returns ``(column_index, distinct_values)`` for the winning feature,
        or ``(None, None)`` when there is no feature column or when the best
        score is below ``threshold``.
        """
        n_features = self.x.shape[1]
        if n_features == 0:
            return None, None

        labels = self.y.reshape(-1, 1)
        scores = {}
        for col_index in range(n_features):
            score = calculate_method(self.x[:, col_index].reshape(-1, 1), labels)
            # A NaN score (e.g. from a 0/0 ratio) carries no information and
            # must never win the ranking or slip past the threshold test.
            scores[col_index] = float('-inf') if score != score else score

        target = max(scores, key=scores.__getitem__)
        if scores[target] < threshold:
            return None, None
        return target, np.unique(self.x[:, target])

    def build_children(self, method, threshold):
        """Split this node into one child per value of the best feature.

        method: ``'information gain'`` selects ``entropy_condition``; any
            other value selects ``entropy_condition_ratio``.
        threshold: minimum score the best feature must reach for the node
            to be split.

        The node is left as a leaf (no children) when it holds no samples,
        when all its labels are equal, when no feature column is left, or
        when no feature reaches ``threshold``.
        """
        distinct_labels = np.unique(self.y)
        if len(distinct_labels) == 0:
            # No samples at all: fall back to the tree-wide default label.
            self.label = self.default_label
            return
        if len(distinct_labels) == 1:
            self.label = distinct_labels[0]
            return
        if self.x.shape[1] == 0:
            # Every feature has been used up; the node keeps the majority
            # label of its samples computed in __init__.
            return

        if method == 'information gain':
            criterion = entropy_condition
        else:
            # method == 'information gain ratio'
            criterion = entropy_condition_ratio
        x_index, x_vals = self._get_x_and_xval(criterion, threshold)

        if x_index is None:
            # No feature is worth splitting on; keep the majority label.
            return

        # The index refers to this node's own feature columns: each child
        # receives the samples with the split column removed.
        self.next_split_index = x_index
        for val in x_vals:
            subset = self.xy[self.xy[:, x_index] == val]
            subset = np.delete(subset, [x_index], axis=1)
            self.children.append(
                DTNode(subset[:, :-1], subset[:, -1].reshape(-1, 1), self.default_label, val)
            )


In [3]:
from collections import Counter
class DecisionTree:
    """Decision tree built breadth-first from ``DTNode`` objects.

    ``method`` chooses the feature selection criterion: ``'ID3'`` uses
    information gain, any other value (intended: ``'C4.5'``) uses the
    information gain ratio. ``threshold`` is handed to every node as the
    minimum score required to split it.
    """

    def __init__(self, method, threshold):
        self.method = method
        self.threshold = threshold
        self.feature_selection_method = (
            "information gain" if method == 'ID3' else "information gain ratio"
        )
        # Filled in by fit().
        self.x = None
        self.y = None
        self.root = None
        self.default_label = None

    def _breadth_first(self):
        """Yield the nodes of the tree level by level, starting at the root.

        The children of a node are read only after control returns from the
        consumer, so a consumer may create them while handling that node.
        """
        pending = [self.root]
        cursor = 0
        while cursor < len(pending):
            node = pending[cursor]
            cursor += 1
            yield node
            pending.extend(node.get_children())

    def fit(self, x, y):
        """Build the tree from features ``x`` and the label column ``y``."""
        self.x, self.y = x, y
        # Default label: the most frequent label of the training set.
        flat_labels = self.y.reshape(1, -1).tolist()[0]
        self.default_label = Counter(flat_labels).most_common(1)[0][0]
        # Grow the tree in breadth-first order.
        self.root = DTNode(x, y, self.default_label)
        for node in self._breadth_first():
            node.build_children(self.feature_selection_method, self.threshold)

    def show(self):
        """Print the details of every node in breadth-first order."""
        for node in self._breadth_first():
            print('==============')
            print('node label:', node.get_label())
            print('node split_val', node.get_split_val())
            print('node next_split_index:', node.get_next_split_index())
            print('xy:')
            print(node.get_xy())
        

In [4]:
# Toy dataset: five lists of 15 values each, transposed so that every row is
# one sample with five columns. The first four columns are passed to fit() as
# features and the last column as the label.
# NOTE(review): the meaning of the individual columns is not stated anywhere
# in this file — confirm against the data's origin before relying on it.
xy = np.array([[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2], [0,0,1,1,0,0,0,1,0,0,0,0,1,1,0], [0,0,0,1,0,0,0,1,1,1,1,1,0,0,0], 
             [0,1,1,0,0,0,1,1,2,2,2,1,1,2,0], [0,0,1,1,0,0,0,1,1,1,1,1,1,1,0]]).T
# 'ID3' selects information gain; a node is split only if its best feature
# scores at least 0.1.
dt = DecisionTree(method = 'ID3', threshold = 0.1)
# Labels are reshaped into a column vector, the layout the tree code stacks
# next to the feature matrix.
dt.fit(xy[:, :-1], xy[:, -1].reshape(-1, 1))
# Print every node of the fitted tree.
dt.show()

node label: 1
node split_val None
node next_split_index: 2
xy:
[[0 0 0 0 0]
 [0 0 0 1 0]
 [0 1 0 1 1]
 [0 1 1 0 1]
 [0 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 1 0]
 [1 1 1 1 1]
 [1 0 1 2 1]
 [1 0 1 2 1]
 [2 0 1 2 1]
 [2 0 1 1 1]
 [2 1 0 1 1]
 [2 1 0 2 1]
 [2 0 0 0 0]]
node label: 0
node split_val 0
node next_split_index: 1
xy:
[[0 0 0 0]
 [0 0 1 0]
 [0 1 1 1]
 [0 0 0 0]
 [1 0 0 0]
 [1 0 1 0]
 [2 1 1 1]
 [2 1 2 1]
 [2 0 0 0]]
node label: 1
node split_val 1
node next_split_index: None
xy:
[[0 1 0 1]
 [1 1 1 1]
 [1 0 2 1]
 [1 0 2 1]
 [2 0 2 1]
 [2 0 1 1]]
node label: 0
node split_val 0
node next_split_index: None
xy:
[[0 0 0]
 [0 1 0]
 [0 0 0]
 [1 0 0]
 [1 1 0]
 [2 0 0]]
node label: 1
node split_val 1
node next_split_index: None
xy:
[[0 1 1]
 [2 1 1]
 [2 2 1]]


