In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter

import math
from math import log

In [2]:
# 经验熵
def entropy(datasets):
    data_length = len(datasets)
    label_count = {}
    for i in range(data_length):
        label = datasets[i][-1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    entropy = - sum([(p / data_length) * log(p / data_length, 2)
                    for p in label_count.values()])
    return entropy

# 条件经验熵
def cond_entropy(datasets, axis=0):
    """
    求数据集datasets中第axis列的条件经验熵
    """
    data_length = len(datasets)
    feature_sets = {}
    for i in range(data_length):
        feature = datasets[i][axis]
        if feature not in feature_sets:
            feature_sets[feature] = []
        feature_sets[feature].append(datasets[i])
    cond_entropy = sum([(len(p) / data_length) * entropy(p)
                        for p in feature_sets.values()])
    return cond_entropy

# 信息增益 = 经验熵 - 条件经验熵
def info_gain(entropy, cond_entropy):
    return entropy - cond_entropy

# 利用信息增益选择根节点
def info_gain_train(datasets):
    data_dim = len(datasets[0]) - 1
    ent = entropy(datasets)
    info_gain_feature = []
    for i in range(data_dim):
        i_info_gain = info_gain(ent, cond_entropy(datasets, axis=i))
        info_gain_feature.append(i_info_gain)
        print('特征{}的信息增益为：{}'.format(i + 1, i_info_gain))
    best_feature = max(info_gain_feature)
    return '特征{}的信息增益最大，选择根节点特征'.format(info_gain_feature.index(best_feature) + 1)

In [3]:
datasets = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否'],
               ]
labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
train_data = pd.DataFrame(datasets, columns=labels)

In [4]:
info_gain_train(np.array(datasets))

特征1的信息增益为：0.08300749985576883
特征2的信息增益为：0.32365019815155627
特征3的信息增益为：0.4199730940219749
特征4的信息增益为：0.36298956253708536


'特征3的信息增益最大，选择根节点特征'

## ID3算法

In [5]:
# 定义二叉树
# 最后生成的决策树类似这种，{'声音': {'粗': {'头发': {'长': '女', '短': '男'}}, '细': '女'}}
class Node:
    def __init__(self, root = True, label = None, feature_name = None, feature = None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature 
        self.tree = {}
        self.result = {
            'label': self.label, 
            'feature_name': self.feature_name,
            'tree': self.tree
        }
    
    def __repr__(self):  # __repr__魔法函数 返回一个对象的描述信息，可以直接调用对象，不用通过print(object)
        return '{}'.format(self.result)
    
    def add_node(self, val, node):
        self.tree[val] = node
    
    def predict(self, features):
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)
        
# 生成决策树
class DTree:
    def __init__(self, epsilon=0.1):  # 信息增益的阈值
        self.epsilon = epsilon
        self._tree = {}
        
    # 定义熵
    @staticmethod
    def calc_entropy(datasets):
        """
        dataset: np.array(), 不含列名，包含标签列
        """
        data_length = len(datasets)
        label_count = {}
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        entropy = - sum([(p / data_length) * log(p / data_length, 2)
                        for p in label_count.values()])
        return entropy
        
    # 定义经验条件熵
    def cond_entropy(self, datasets, axis = 0):
        """
        dataset: np.array(), 不含列名，包含标签列
        """
        data_length = len(datasets)
        feature_sets = {}
        for i in range(data_length):
            feature = datasets[i][axis]
            if feature not in feature_sets:
                feature_sets[feature] = []
            feature_sets[feature].append(datasets[i])
        cond_entropy = sum([(len(p) / data_length) * self.calc_entropy(p)
                            for p in feature_sets.values()])
        return cond_entropy
    
    # 定义信息增益
    @staticmethod
    def info_gain(entropy, cond_entropy):
        return entropy - cond_entropy
    
    
    def info_gain_train(self, datasets):
        """
        dataset: np.array(), 不含列名，包含标签列
        """
        data_dim = len(datasets[0]) - 1
        entropy = self.calc_entropy(datasets)
        best_feature = []
        for dim in range(data_dim):
            dim_info_gain = self.info_gain(entropy, self.cond_entropy(datasets, axis = dim))
            best_feature.append((dim, dim_info_gain))
        
        best_ = max(best_feature, key=lambda x: x[-1]) # 返回(1, 有工作)
        return best_
    
    def train(self, tain_data):
        """
        input: 数据集D(DataFrame)
        output: 决策树 eg. {'声音': {'粗': {'头发': {'长': '女', '短': '男'}}, '细': '女'}}
        """
        _, y_train, features = train_data.iloc[:,:-1], train_data.iloc[:, -1], train_data.columns[:-1]
        
        # 1. 若D中所有实例都是同一类C_k，则T为单节点，并将C_k作为结点的类标记，返回T
        if len(y_train.value_counts()) == 1:
            return Node(root = True, label = y_train.iloc[0])
        
        # 2. 若A为空，则T为单节点树，并将D中实例数最大的类标记C_k为该结点的类，返回T
        if len(features) == 0:
            return Node(root = True, label = y_train.value_counts().sort_values(ascending=False).index[0]) # value_counts中的ascending=Flase，降序排列
        
        # 3. 计算A中对D的信息增益，选择最大信息增益的Ag
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]
        
        # 4. Ag的信息增益小于阈值epsilon，则设置T为单节点，并将D中的实例数最大的C_k作为该节点的标记，返回T
        if max_info_gain < self.epsilon:
            return Node(root = True, label = y_train.value_counts().index[0])
        
        # 5. 否则，对Ag中的每一个a_i，依据Ag=a_i将D分割为若干非空子集D_i，并将D_i中实例数最大的C_k作为该节点的标记，构建子集
        node_tree = Node(root=False, feature_name = max_feature_name, feature = max_feature)
        feature_list = train_data[max_feature_name].value_counts().index
        
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] == f].drop([max_feature_name], axis = 1)
            # 6. 递归的生成树
            sub_tree = self.train(sub_train_df)
            node_tree.add_node(f, sub_tree)
        
        return node_tree
    
    def fit(self, train_data):
        self._tree = self.train(train_data)
        return self._tree # self._tree 就是node
    
    def predict(sel, X_test):
        """
        X_test：输入的单个实例
        """
        return self._tree.predict(X_test)
 

In [6]:
data_df = pd.DataFrame(datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)
###我找不到无穷递归的原因。。。。。。

RecursionError: maximum recursion depth exceeded

## scikit-learn

DecisionTreeClassifier(criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.,
                 min_impurity_split=None,
                 class_weight=None,
                 presort=False)

参数含义：

1.**criterion**:string, optional (default="gini")
(1).criterion='gini',分裂节点时评价准则是Gini指数。
(2).criterion='entropy',分裂节点时的评价指标是信息增益。
            
2.**max_depth**:int or None, optional (default=None)。指定树的最大深度。
    如果为None，表示树的深度不限。直到所有的叶子节点都是纯净的，即叶子节点中所有的样本点都属于同一个类别。或者每个叶子节点包含的样本数小于min_samples_split。

3.**splitter**:string, optional (default="best")。指定分裂节点时的策略。
    (1).splitter='best',表示选择最优的分裂策略。适用数据量小
    (2).splitter='random',表示选择最好的随机切分策略。适用数据量大

4.min_samples_split:int, float, optional (default=2)。表示分裂一个内部节点需要的做少样本数。
    (1).如果为整数，则min_samples_split就是最少样本数。
    (2).如果为浮点数(0到1之间)，则每次分裂最少样本数为ceil(min_samples_split * n_samples)

5.min_samples_leaf: int, float, optional (default=1)。指定每个叶子节点需要的最少样本数。
    (1).如果为整数，则min_samples_split就是最少样本数。
    (2).如果为浮点数(0到1之间)，则每个叶子节点最少样本数为ceil(min_samples_leaf * n_samples)

6.min_weight_fraction_leaf:float, optional (default=0.) 指定叶子节点中样本的最小权重。

7.max_features:int, float, string or None, optional (default=None).
    搜寻最佳划分的时候考虑的特征数量。
    (1).如果为整数，每次分裂只考虑max_features个特征。
    (2).如果为浮点数(0到1之间)，每次切分只考虑int(max_features * n_features)个特征。
    (3).如果为'auto'或者'sqrt',则每次切分只考虑sqrt(n_features)个特征
    (4).如果为'log2',则每次切分只考虑log2(n_features)个特征。
    (5).如果为None,则每次切分考虑n_features个特征。
    (6).如果已经考虑了max_features个特征，但还是没有找到一个有效的切分，那么还会继续寻找下一个特征，直到找到一个有效的切分为止。

8.random_state:int, RandomState instance or None, optional (default=None)
    (1).如果为整数，则它指定了随机数生成器的种子。
    (2).如果为RandomState实例，则指定了随机数生成器。
    (3).如果为None，则使用默认的随机数生成器。

9.max_leaf_nodes: int or None, optional (default=None)。指定了叶子节点的最大数量。
    (1).如果为None,叶子节点数量不限。
    (2).如果为整数，则max_depth被忽略。

10.min_impurity_decrease:float, optional (default=0.)
    如果节点的分裂导致不纯度的减少(分裂后样本比分裂前更加纯净)大于或等于min_impurity_decrease，则分裂该节点。
    加权不纯度的减少量计算公式为：
    min_impurity_decrease=N_t / N * (impurity - N_t_R / N_t * right_impurity
                    - N_t_L / N_t * left_impurity)
    其中N是样本的总数，N_t是当前节点的样本数，N_t_L是分裂后左子节点的样本数，
    N_t_R是分裂后右子节点的样本数。impurity指当前节点的基尼指数，right_impurity指分裂后右子节点的基尼指数。left_impurity指分裂后左子节点的基尼指数。

11.min_impurity_split:float
    树生长过程中早停止的阈值。如果当前节点的不纯度高于阈值，节点将分裂，否则它是叶子节点。
    这个参数已经被弃用。用min_impurity_decrease代替了min_impurity_split。

12.class_weight:dict, list of dicts, "balanced" or None, default=None
    类别权重的形式为{class_label: weight}
    (1).如果没有给出每个类别的权重，则每个类别的权重都为1。
    (2).如果class_weight='balanced'，则分类的权重与样本中每个类别出现的频率成反比。
    计算公式为：n_samples / (n_classes * np.bincount(y))
    (3).如果sample_weight提供了样本权重(由fit方法提供)，则这些权重都会乘以sample_weight。

13.presort:bool, optional (default=False)
    指定是否需要提前排序数据从而加速训练中寻找最优切分的过程。设置为True时，对于大数据集
    会减慢总体的训练过程；但是对于一个小数据集或者设定了最大深度的情况下，会加速训练过程。

In [25]:
## sklearn里并没有实现后剪枝，只有预剪枝操作，例如设置树的最大深度max_depth

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = [
    'sepal length', 'sepal width', 'petal lenght', 'petal width', 'label'
]
def shuffle(X,Y):
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return X[randomize], Y[randomize]

data = np.array(df)
X, Y = data[:, :2], data[:,-1]
X, y = shuffle(X,Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3)

In [26]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import graphviz

In [27]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)

DecisionTreeClassifier()

In [28]:
clf.score(X_test,Y_test)

0.3111111111111111

In [29]:
tree_pic = export_graphviz(clf, out_file = 'mytree.pdf')
with open('mytree.pdf') as f:
    dot_graph = f.read()

In [30]:
graphviz.Source(dot_graph)

ExecutableNotFound: failed to execute ['dot', '-Kdot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x7fb06d516d68>

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
import numpy as np 
import pandas as pd

from sklearn import tree
import graphviz

features = ["年龄", "有工作", "有自己的房子", "信贷情况"]
X_train = pd.DataFrame([
    ["青年", "否", "否", "一般"],
    ["青年", "否", "否", "好"],
    ["青年", "是", "否", "好"],
    ["青年", "是", "是", "一般"],
    ["青年", "否", "否", "一般"],
    ["中年", "否", "否", "一般"],
    ["中年", "否", "否", "好"],
    ["中年", "是", "是", "好"],
    ["中年", "否", "是", "非常好"],
    ["中年", "否", "是", "非常好"],
    ["老年", "否", "是", "非常好"],
    ["老年", "否", "是", "好"],
    ["老年", "是", "否", "好"],
    ["老年", "是", "否", "非常好"],
    ["老年", "否", "否", "一般"]
])
y_train = pd.DataFrame(["否", "否", "是", "是", "否", 
                        "否", "否", "是", "是", "是", 
                        "是", "是", "是", "是", "否"])

#数据预处理
le_x = preprocessing.LabelEncoder()
le_x.fit(np.unique(X_train)) # 找出X_tran中的特征值有哪些
# array(['一般', '中年', '否', '好', '是', '老年', '青年', '非常好'], dtype=object)
# 这样 0代表‘一般’，1代表‘中年’，....
X_train = X_train.apply(le_x.transform) # apply()可以传入一个函数
le_y = preprocessing.LabelEncoder()
le_y.fit(np.unique(y_train))
y_train = y_train.apply(le_y.transform)

# 调用sklearn.DT训练模型
model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)


DecisionTreeClassifier()