In [62]:
import numpy as np
import math
from abc import abstractmethod
import operator

# 定义节点类
class DecisionTreeNode:
    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}
        self.result = {
            'label:': self.label,
            'feature': self.feature,
            'tree': self.tree
        }

class BaseDecisionTree:
    '''
    @abstractmethod
    def __init__(self,
                 criterion,
                 splitter,
                 max_depth,
                 min_samples_split,
                 min_samples_leaf,
                 min_weight_fraction_leaf,
                 max_features,
                 max_leaf_nodes,
                 random_state,
                 min_impurity_decrease,
                 min_impurity_split,
                 class_weight=None,
                 presort=False):
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.class_weight = class_weight
        self.presort = presort
'''
    @staticmethod
    def entropy(data):
        '''
        输入数据data,输出其经验熵'''
        n=len(data)   #数据个数
        label_dict={}
        for i in range(n):
            label_dict[data[i][-1]]=label_dict.get(data[i][-1],0)+1
        k=len(label_dict)  #类别个数
        ent=0
        for n_k in label_dict.values():
            ent+= n_k/n * math.log(n_k/n,2)
        return -ent
    
    @staticmethod
    def conditional_entropy(data,a):
        '''
        输入数据data和用来分类的特征a(即数据的第a列),输出条件熵'''
        n=len(data)   #数据个数
        con_ent=0
        new_data=BaseDecisionTree.data_divide(data,a)
        for curr_data in new_data:
            con_ent+= len(curr_data)/n * BaseDecisionTree.entropy(curr_data)        
        return con_ent
    
    @staticmethod
    def data_divide(data,a):
        '''
        根据第a列特征将数据划分'''
        new_data={}
        for curr_data in data:
            new_data[curr_data[a]]=new_data.get(curr_data[a],[])
            new_data[curr_data[a]].append(curr_data)
        return list(new_data.values())
    
    @staticmethod
    def most_class(data):
        '''
        返回数据集中实例数最多的类'''
        n=len(data)   #数据个数
        label_dict={}
        for i in range(n):
            label_dict[data[i][-1]]=label_dict.get(data[i][-1],0)+1
        m=0
        for key in label_dict.keys():
            if label_dict[key]>m:
                m=label_dict[key]
                res=key
        return res
    
class ID3(BaseDecisionTree):
    def __init__(self,epsilon=1e-3):
        self.epsilon=epsilon
        
    def fit(self,data):
        def dfs(new_data,feature_list):  #递归创建树
            if len(set([x[-1] for x in new_data]))==1:  #当前数据集所有实例都属于同一类
                new_node=DecisionTreeNode()
                new_node.label=ID3.most_class(new_data)
                return new_node.result
            print("feature_list",feature_list)
            best_feature_index,information_gain=self.__chooseBestFeature(new_data)  #选取最优的特征
            best_feature=feature_list[best_feature_index]
            print("best_feature,information_gain",best_feature,information_gain)
            if information_gain<self.epsilon:   #当信息增益小于阈值epsilon时停止
                return  None
            
            new_node=DecisionTreeNode()
            new_node.feature=best_feature
            new_node.label=ID3.most_class(new_data)
            
            next_data_list=ID3.data_divide(new_data,best_feature)  #用最优的特征划分当前数据集
            for next_data in next_data_list:  #对划分后的每个新数据集递归创建树
                feature_value=next_data[0][best_feature]  #最优特征在当前数据集中的取值
                if len(feature_list)>1:
                    new_node.tree[feature_value]=dfs([x[:best_feature_index]+x[best_feature_index+1:] for x in next_data],
                                                                   feature_list[:best_feature_index]+feature_list[best_feature_index+1:])
                else:new_node.tree[feature_value]=DecisionTreeNode(label=ID3.most_class(next_data))
            return new_node.result
        return dfs(data,list(range(len(data[0])-1)))
    
    def __chooseBestFeature(self,data):#选取最优的特征
        
        ent=ID3.entropy(data)  #数据集的经验熵
        n_features=len(data[0])-1  #特征个数
        information_gain_list=[]  #每个特征对数据集的信息增益
        for i in range(n_features):
            information_gain_list.append(ent-ID3.conditional_entropy(data,i))
        #获取最大的信息增益对应的特征索引及信息增益值
        print("ent,information_gain_list",ent,information_gain_list)
        min_index, min_number = max(enumerate(information_gain_list), key=operator.itemgetter(1))  
        return min_index, min_number

In [63]:
BaseDecisionTree.conditional_entropy(datasets,0)

0.8879430945988998

In [64]:
a=ID3()
b=a.fit(datasets)
b

feature_list [0, 1, 2, 3]
ent,information_gain_list 0.9709505944546686 [0.08300749985576883, 0.32365019815155627, 0.4199730940219749, 0.36298956253708536]
best_feature,information_gain 2 0.4199730940219749
feature_list [0, 1, 3]
ent,information_gain_list 0.9182958340544896 [0.2516291673878229, 0.9182958340544896, 0.47385138961004514]
best_feature,information_gain 1 0.9182958340544896


{'label:': None,
 'feature': None,
 'tree': {'否': {'label:': None,
   'feature': None,
   'tree': {'否': {'label:': None, 'feature': None, 'tree': {}},
    '是': {'label:': None, 'feature': None, 'tree': {}}}},
  '是': {'label:': None, 'feature': None, 'tree': {}}}}

In [None]:
{'label:': None, 'feature': 2, 'tree': {'否': {'label:': None, 'feature': 1, 'tree': {'否': {'label:': '否', 'feature': None, 
'tree': {}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}, '是': {'label:': '是', 'feature': None, 'tree': {}}}}

In [10]:
a=[[1,2,3],[2,2,2]]
[x[:0]+x[1:] for x in a]

[[2, 3], [2, 2]]

In [19]:
def f():
    return 2,3
f()[1]

3

In [25]:
datasets = [['青年', '否', '否', '一般', '否否'],
           ['青年', '否', '否', '好', '否否'],
           ['青年', '是', '否', '好', '是是'],
           ['青年', '是', '是', '一般', '是是'],
           ['青年', '否', '否', '一般', '否否'],
           ['中年', '否', '否', '一般', '否否'],
           ['中年', '否', '否', '好', '否否'],
           ['中年', '是', '是', '好', '是是'],
           ['中年', '否', '是', '非常好', '是是'],
           ['中年', '否', '是', '非常好', '是是'],
           ['老年', '否', '是', '非常好', '是是'],
           ['老年', '否', '是', '好', '是是'],
           ['老年', '是', '否', '好', '是是'],
           ['老年', '是', '否', '非常好', '是是'],
           ['老年', '否', '否', '一般', '否否'],
           ]
labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']

In [35]:
a=[]
b=[1,2,3]
a.append(b)

In [36]:
a

[[1, 2, 3]]