In [1]:
import numpy as np

与Cart相比，C4.5有以下区别：

    划分时的特征选择：进行特征选择时使用信息增益率而不是gini
    划分的分支：C4.5是多叉树而Cart是二叉树
    预测误差： Cart使用gini/Variance，而C4.5使用entropy
    
    

计算信息增益率：
![image.png](attachment:image.png)

其中，分母是特征的熵：
![image-2.png](attachment:image-2.png)

**计算特征的熵**

In [42]:
def ent(x, weight=None):
    """
    计算特征的熵/标签的熵
    :param x :数据集的某一个特征的全部对应取值（特征熵）或者标签的熵
    :param weight :每一个样本对应的权重,
    :return 特征的熵/标签的熵
    
    """
    # weight非空时计算的是标签的熵
    if weight is None:# 说明计算的是特征熵，将weight初始化为全1
        weight = np.ones(len(x))
        
    unique_x = np.unique(x)
    weight_sum = np.sum(weight)
    p_x = [np.sum(weight[x==a])/weight_sum for a in unique_x]
    return np.sum([-p * np.log2(p) for  p in p_x])

In [3]:
# 训练数据，西瓜数据集2.0
"""

        # 1
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 2
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 3
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 4
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 5
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 6
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        # 7
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        # 8
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],

        # ----------------------------------------------------
        # 9
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        # 10
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        # 11
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        # 12
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        # 13
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 14
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 15
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        # 16
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        # 17
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]

    # 特征值列表
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
    
'色泽', 0-乌黑，1-青绿，2-浅白
'根蒂', '敲击', '纹理', '脐部', '触感'

"""
X_train = np.array([
       [2, 1, 1, 1, 1, 1],
       [1, 1, 2, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [2, 1, 2, 1, 1, 1],
       [3, 1, 1, 1, 1, 1],
       [2, 2, 1, 1, 2, 2],
       [1, 2, 1, 2, 2, 2],
       [1, 2, 1, 1, 2, 1],
       [1, 2, 2, 2, 2, 1],
       [2, 3, 3, 1, 3, 2],
       [3, 3, 3, 3, 3, 1],
       [3, 1, 1, 3, 3, 2],
       [2, 2, 1, 2, 1, 1],
       [3, 2, 2, 2, 1, 1],
       [1, 2, 1, 1, 2, 2],
       [3, 1, 1, 3, 3, 1],
       [2, 1, 2, 2, 2, 1]])
y_train = np.array([1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0])
weight = np.ones(len(X_train))


for i in range(X_train.shape[1]):#计算每个特征的熵
    print(ent(X_train[:,i]))
# 索引的熵
print(ent(np.arange(len(X_train))))

1.5798634010685344
1.402081402756032
1.3328204045850196
1.4466479595102752
1.548565226030918
0.8739810481273578
4.087462841250339


计算信息增益率时，由上述公式求得特征的信息熵后，接下来是求信息增益
![image.png](attachment:image.png)

**信息增益info_gain**

    划分前的信息熵 - 划分后的信息熵
    
    对于划分后的信息熵：
    对于离散特征，C4.5划分的是多叉树：
    由于C4.5是多叉树，不用像Cart一样对某一个特征的所有候选划分点来循环寻找最佳划分点了，而是同时将所有候选划分点作为划分点一次性分多叉。
    对于连续特征，C4.5与Cart相同，划分的是二叉树，所有要使用循环寻找最佳划分点：
![image-2.png](attachment:image-2.png)

特征的划分流程—离散与连续的处理方式不同    

    连续特征像Cart一样因为是二叉划分，但是有很多分割点，所以需要用与Cart一样的方法寻找最佳划分点
    离散特征有很多分割点，每一分割点都对应一个树分支，所以产生的是多叉树 
   **因此如果像Cart一样先划分得到branch_indices再进行递归建树的话，多叉树太复杂了**
    
    所以先将二者拆开来：先找最佳特征，计算所有特征的划分后对应的熵，此时不做切分，只计算entropy、info_gain_rate,
    找到最佳特征后再进行真正的划分，一旦划分后马上进行递归建树

**连续型特征**

1.找最佳特征：先计算所有特征的划分后对应的熵

In [4]:
# 处理连续特征时

"""
计算连续型feature_i的entropy
:param node_indices:当前节点的索引

"""
best_entropy = 0
X_node = X[node_indices]
y_node = y[node_indices]
weight_node = weight[node_indices]
# 1、确定特征i的所有可能取值
feature_values = np.unique(X_node[:, feature_i])# 已排序的
split_points = (feature_values[1:] + feature_values[:-1]) / 2 # 特征是连续型特征则使用二分法找到所有的切分点
# 连续型feature_i需要找到最佳分割点代表它
for point in split_points:
    cond = (X_node[:, feature_i] <= point)
    left_entropy = (np.sum(weight_node[cond])/np.sum(weight_node))*ent(y_node[cond],weight_node[cond])
    right_entropy = (np.sum(weight_node[~cond])/np.sum(weight_node))*ent(y_node[~cond], weight_node[~cond])
    cur_entropy = left_entropy + right_entropy
    if cur_entropy <= best_entropy:
        best_entropy = cur_entropy
        best_split_point = point
# 此时best split_point,best_entropy就代表了feature_i的entropy

NameError: name 'X_node' is not defined

2.真正划分

In [None]:
# 找到了最佳划分点,进行真正的划分（取对应的indices）
# 取对应的indices
left_indices = [i for i in node_indices if X[i, feature_i]<= best_split_point]
right_indices = [i for i in node_indices if ~(X[i, feature_i] <= best_split_point)]
# 左右分支递归建树

**离散型特征**

1.找最佳特征-先计算所有特征的划分后对应的熵

In [None]:
# 处理离散特征时
X_node = X[node_indices]
y_node = y[node_indices]
weight_node = weight[node_indices]
entropy = 0
# 1、确定特征i的所有可能取值
feature_values = np.unique(X_node[:, feature_i])# 已排序的
# 离散型特征直接使用特征的各个取值作为切分点
split_points = feature_values

for point in split_points:
    cond = (X_node[:, feature_i] == point)
    cur_entropy = (np.sum(weight_node[cond])/np.sum(weight_node))*ent(y_node[cond], weight_node[cond])
    entropy += cur_entropy
    
    

2.真正划分

In [None]:
for point in split_points:
    # 对应的indices
    cur_idx = [i for i in node_indices if X[i, feature_i]==point]
    # 多分支递归建树
            

统一离散/连续性特征的求每一个特征对应划分后的熵过程

In [50]:
def get_feature_entropy(X, y, weight, feature_i, is_linear):
    """
    计算特征i的entropy
    :param X :当前结点考虑的样本（无缺失）
    :param y : 当前结点考虑的样本对应标签（无缺失） 
    :param feature_i : 当前欲计算的特征
    :param is_linear : 特征i是否连续型
    :return 特征i的entropy,如果是连续特征，还会返回特征对应的最佳切分点
    
    """
    # 初始化
    entropy = 0
    best_split_point = None # 只有连续特征才用得到
    
    # 1、确定特征i的所有可能取值
    feature_values = np.unique(X[:, feature_i])# 已排序的
    if is_linear:
        # 特征是连续型特征则使用二分法找到所有的切分点
        split_points = (feature_values[1:] + feature_values[:-1]) / 2 
        # 连续型feature_i需要找到最佳分割点代表它
        best_split_entropy = 999
        for point in split_points:
            cond = (X[:, feature_i] <= point)
            left_entropy = (np.sum(weight[cond])/np.sum(weight))*ent(y[cond],weight[cond])
            right_entropy= (np.sum(weight[~cond])/np.sum(weight))*ent(y[~cond], weight[~cond])
            cur_entropy = left_entropy + right_entropy
            if cur_entropy <= best_split_entropy:
                best_split_entropy = cur_entropy
                best_split_point = point
        # 此时best split_point,best_entropy就代表了feature_i的entropy
        entropy = best_split_entropy
        
    else:
        # 离散型特征直接使用特征的各个取值作为切分点
        split_points = feature_values
        for point in split_points:
            cond = (X[:, feature_i] == point)
            cur_entropy = (np.sum(weight[cond])/np.sum(weight)) * ent(y[cond], weight[cond])
            entropy += cur_entropy
    return entropy,best_split_point  

In [5]:
# 训练数据，西瓜数据集2.0
"""

        # 1
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 2
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 3
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 4
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],
        # 5
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],
        # 6
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],
        # 7
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],
        # 8
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],

        # ----------------------------------------------------
        # 9
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],
        # 10
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],
        # 11
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],
        # 12
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],
        # 13
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 14
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],
        # 15
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],
        # 16
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],
        # 17
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']
    ]

    # 特征值列表
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感']
    
'色泽', 0-乌黑，1-青绿，2-浅白
'根蒂', '敲击', '纹理', '脐部', '触感'

"""
X_train = np.array([
       [2, 1, 1, 1, 1, 1],
       [1, 1, 2, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [2, 1, 2, 1, 1, 1],
       [3, 1, 1, 1, 1, 1],
       [2, 2, 1, 1, 2, 2],
       [1, 2, 1, 2, 2, 2],
       [1, 2, 1, 1, 2, 1],
       [1, 2, 2, 2, 2, 1],
       [2, 3, 3, 1, 3, 2],
       [3, 3, 3, 3, 3, 1],
       [3, 1, 1, 3, 3, 2],
       [2, 2, 1, 2, 1, 1],
       [3, 2, 2, 2, 1, 1],
       [1, 2, 1, 1, 2, 2],
       [3, 1, 1, 3, 3, 1],
       [2, 1, 2, 2, 2, 1]])
y_train = np.array([1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0])
weight = np.ones(len(X_train))

best_gain = 0
best_feature_i = None
for i in range(X_train.shape[1]):
    feature_entropy, best_split_point = get_feature_entropy(X_train, y_train,weight,i, is_linear=False)
    info_gain = ent(y_train, weight) - feature_entropy
    print(info_gain, best_split_point)
    if info_gain >= best_gain:
        best_gain = info_gain
        best_feature_i = i
print(best_gain, best_feature_i)

0.10812516526536531 None
0.14267495956679288 None
0.14078143361499584 None
0.3805918973682686 None
0.28915878284167895 None
0.006046489176565584 None
0.3805918973682686 3


In [51]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
data = load_iris()
X,y = data.data, data.target
X_train,test_X,y_train,test_y = train_test_split(X,y,test_size=0.3)
get_feature_entropy(X_train, y_train,np.ones(len(X_train)), 0, is_linear=True )

(1.0152515967858389, 5.55)

找最佳特征

**注意**
    下述算法直接选择信息率最高的特征进行划分，实际上由于增益率准则对可取值数目较少的属性有所偏好，所以实际上应该选用启发式的方式：先从候选划分属性中找**信息增益**高于平均的属性， 再从中选择**增益率**最高的

In [None]:
"""
找到最佳特征
: param X : 所有特征
: param y : 所有标签
: param node_indices : 当前考虑的样本集合对应的索引（无缺失值）
: param is_linear : 当前所有特征的类型

"""
# 初始化
best_feature_i = None
best_info_gain_rate = 0
n_features = X.shape[1]
for feature_i in range(n_features):
    # 求每一个特征对应的熵
    cur_feature_entropy = get_feature_entropy(X[node_indices], y[node_indices], weight[node_indices], feature_i, is_linear)
    # 求每一个特征对应的info_gain_rate
    cur_info_gain_rate = (ent(y[node_indices], weight[node_indices]) - cur_feature_entropy) / ent(X[node_indices][:, feature_i])
    if cur_info_gain_rate >= best_info_gain_rate:
        best_info_gain_rate = cur_info_gain_rate
        best_feature_i = feature_i
# 找到了最佳特征及其对应的info_gain_rate

上述代码加上缺失值处理

In [33]:
# 当前样本X,y,weight,node_indices
"""
找到最佳特征
:param X : 所有样本
:param y : 所有样本对应的标签
:param weight : 所有样本对应的权重
:param node_indices : 当前样本对应的索引
:param is_linear : 所有特征的属性是连续型还是离散型

"""

# 初始化
best_info_gain_rate = 0
best_feature_i = None

# 特征选择
n_features = X.shape[1]
for feature_i in range(n_features):
    ## 找出未缺失的样本
    nonan_indices = [i  for i in node_indices if ~np.isnan(X[i, feature_i])]
    # 找出缺失样本
    nan_indices = [i  for i in node_indices if np.isnan(X[i, feature_i])]
    ## 进行特征选择时使用信息增益率（info_gain_rate）
    # 1.求本特征的信息熵
    cur_feature_entropy = get_feature_entropyt(X[nonan_indices], y[nonan_indices], weight[nonan_indices], is_linear)
    # 2.求信息增益info_gain_rate:(划分前的信息熵 - 划分后的信息熵)/本特征的熵
    cur_info_gini_rate = (ent(y[nonan_indices], weight[nonan_indices]) - cur_feature_entropy)/ent(X[nonan_indices][:, feature_i])
    # 3.计算所有特征的加权info_gini_rate,找到最佳特征
    # 无缺失值样本所占的比例:对每一个样本赋予了权重后,利用权重计算无缺失样本所占的比例
    lou = np.sum(weight[nonan_indices]) / np.sum(weight[node_indices])
    cur_info_gini_rate = lou * cur_info_gini_rate
    # 找到最佳特征
    if cur_info_gini_rate >= best_info_gain_rate:
        best_info_gain_rate = cur_info_gini_rate
        best_feature = feature_i
# 自此完成特征选择    
    

SyntaxError: invalid syntax (2489435387.py, line 20)

模块化

In [52]:
def get_best_split_feature(X,y, weight, node_indices, is_linear):
    """
    找到最佳特征完成特征选择
    :param X : 所有样本
    :param y : 所有样本对应的标签
    :param weight : 所有样本对应的权重
    :param node_indices : 当前样本对应的索引
    :param is_linear : 所有特征的属性是连续型还是离散型

    """

    # 初始化
    best_info_gain_rate = 0
    best_sets = None
    # 特征选择
    n_features = X.shape[1]
    for feature_i in range(n_features):
        ## 找出未缺失的样本
        nonan_indices = [i for i in node_indices if ~np.isnan(X[i, feature_i])]
        # 找出缺失样本
        nan_indices = [i for i in node_indices if np.isnan(X[i, feature_i])]
        ## 进行特征选择时使用信息增益率（info_gain_rate）
        # 1.求本特征的信息熵
        cur_entropy, split_point = get_feature_entropy(X[nonan_indices], y[nonan_indices], weight[nonan_indices],feature_i, is_linear)
        # 2.求信息增益info_gain_rate:(划分前的信息熵 - 划分后的信息熵)/本特征的熵
        cur_info_gini_rate = (ent(y[nonan_indices], weight[nonan_indices]) - cur_entropy)/ent(X[nonan_indices][:, feature_i])
        # 3.计算所有特征的加权info_gini_rate,找到最佳特征
        # 无缺失值样本所占的比例:对每一个样本赋予了权重后,利用权重计算无缺失样本所占的比例
        lou = np.sum(weight[nonan_indices]) / np.sum(weight[node_indices])
        cur_info_gini_rate = lou * cur_info_gini_rate

        # 找到最佳特征
        if cur_info_gini_rate >= best_info_gain_rate:
            best_info_gain_rate = cur_info_gini_rate
            best_sets = {
                "best_feature_i": best_feature_i,
                "best_split_point": split_point,# 只有连续特征才用得到
                "nonan_indices": nonan_indices,
                "nan_indices": nan_indices,
            }
            
    return best_info_gain_rate, best_sets

In [10]:
X_train = np.array([
       [2, 1, 1, 1, 1, 1],
       [1, 1, 2, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [2, 1, 2, 1, 1, 1],
       [3, 1, 1, 1, 1, 1],
       [2, 2, 1, 1, 2, 2],
       [1, 2, 1, 2, 2, 2],
       [1, 2, 1, 1, 2, 1],
       [1, 2, 2, 2, 2, 1],
       [2, 3, 3, 1, 3, 2],
       [3, 3, 3, 3, 3, 1],
       [3, 1, 1, 3, 3, 2],
       [2, 2, 1, 2, 1, 1],
       [3, 2, 2, 2, 1, 1],
       [1, 2, 1, 1, 2, 2],
       [3, 1, 1, 3, 3, 1],
       [2, 1, 2, 2, 2, 1]])
y_train = np.array([1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0])
weight = np.ones(len(X_train))
get_best_split_feature(X_train, y_train, weight, range(len(X_train)), is_linear=False)

(0.2630853587192754,
 {'best_feature_i': 3,
  'best_split_point': None,
  'nonan_indices': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
  'nan_indices': []})

**用找到的特征进行真正的划分**

真正划分时还要整合已缺失样本

由于需要直接递归建立分支树，所有整合在递归建树的代码中

`在进行递归建树时，结点的儿子只有在Cart中由于只有左右儿子，所有只需要left_node和right_node两个状态，现在有许多孩子，需要使用列表了`

In [None]:
# 基于最佳特征进行决策树划分
node.child_node = {}# 用字典保存孩子结点
if is_linear and best_split_point is not None: # 连续特征使用best_split_point进行二叉划分
    # 找到了最佳划分点,进行真正的划分（取对应的indices）
    left_indices = [i for i in node_indices if X[i, best_feature_i]<= best_split_point]
    right_indices = [i for i in node_indices if ~(X[i, best_feature_i] <= best_split_point)]
    # 整合已缺失样本
    left_indices.extend(nan_indices)
    right_indices.extend(nan_indices)
    # 修改权重,将缺失样本按不同的权重放到两个分支中
    left_weight, right_weight = np.zeros_like(weight),np.zeros_like(weight)
    left_weight[left_indices], right_weight[right_indices] = weight[left_indices], weight[right_indices]
    left_weight[nan_indices], right_weight[nan_indices] = np.sum(weight[left_indices]) / np.sum(weight[nonan_indices]),np.sum(weight[right_indices]) / np.sum(weight[nonan_indices])
    # ---------------左右分支递归建树--------------------
    node.child_node["left"] = Node()
    node.right_node["right"] = Node()
    # 让递归函数帮忙建立左右子树
    
else:# 离散特征直接使用所有划分点进行多叉划分
    split_points = np.unique(X[:, best_feature_i])
    for point in split_points:
    # 对应的indices
    cur_idx = [i for i in node_indices if X[i, best_feature_i] ==point]
    # 修改权重,将缺失样本按不同的权重放到多个分支中
    cur_weight = np.zeros_like(weight)
    cur_weight[cur_idx] = weight[cur_idx]
    cur_weight[nan_indices] = np.sum(weight[cur_idx])/np.sum(weight[nonan_indices])
    # ---------------多分支递归建树------------------
    # 记录当前结点的状态：使用列表记录众多孩子
    node.child_node[point] = Node()
    
    
    

In [None]:
def split(X,weight, node_indices, best_sets,is_linear ):
    """
    对当前决策树进行真正的划分
    : param X : 所有样本
    : param weight : 所有权重
    : param node_indices: 当前考虑的结点
    : param best_sets:最佳划分对应的信息，
                best_sets = {
                "best_feature_i": best_feature_i,
                "best_split_point": split_point,# 只有连续特征才用得到
                "nonan_indices": nonan_indices,
                "nan_indices": nan_indices,
            }
    : param 是否为连续特征
    
    """
    

决策树建立流程


In [11]:
class Node(object):
    """通过树结点的各属性记录生成的树结构"""
    def __init__(self,
                 best_feature_i=None, 
                 best_split_point=None,
                 child_nodes = None,
                 leaf_class = None,
                 is_leaf=False,
                 entropy=None):
        """
        每个当前结点Node都记录了当前的划分状况
        :param child_nodes : 结点的所有子结点，字典存储
        :param best_feature_i : 当前结点的最佳划分特征
        :param best_split_point : 当前结点的最佳特征对应的最佳分割点
        :param leaf_class : 记录当前节点所属的类别
        :param is_leaf : 只有在is_leaf==True时，leaf_class才生效
        :param entropy : 当前节点的entropy
        
        """
        self.best_feature_i = best_feature_i
        self.best_split_point = best_split_point
        self.child_nodes = child_nodes
        self.leaf_class = leaf_class
        self.is_leaf = is_leaf
        self.entropy = entropy

In [14]:
 def _build_tree_recussive(self, X,y, node_indices,weight,node:Node, cur_depth, is_linear):
        """
        对于当前节点集合（X，y）-node_indices,递归建立决策树
        :param X: 所有样本
        :param y: 所有标签
        :param node_indices : 当前样本集合对应的索引
        :param weight : 所有样本对应的权重
        :param node : 当前结点的状态记录

        """
        n_samples,n_features = len(node_indices), X.shape[1]
        # 记录本节点的状态
        node.entropy = self._ent(y[node_indices], weight[node_indices])
        node.leaf_class = self._majority_vote(y[node_indices], weight[node_indices])

        ## 递归基
        # 节点包含数据属于同一个类别，此时无需划分
        if len(np.unique(y[node_indices])) <= 1:
            # 记录叶子结点所属的分类
            node.is_leaf = True
            return
        # 没有更多特征(当前节点所含样本所有特征都只有一个取值)
        if np.sum([len(np.unique(X[node_indices][:,i])) for i in range(n_features)]) == n_features:
            node.is_leaf = True
            return
        # 限制构建子树的深度
        if cur_depth >= self.max_depth:
            node.is_leaf = True
            return
        # 限制节点的最小样本量
        if n_samples < self.min_sample_split:
            node.is_leaf = True
            return

        ## 处理当前节点自身(X,y, node_indices,weight)
        # 找到最佳特征和对应的info_gain_rate    
        best_info_gain_rate, best_sets = self._get_best_split_feature(X, y, node_indices, weight, is_linear)
        """
        最佳划分对应的信息，best_sets = {
                "best_feature_i": best_feature_i,
                "best_split_point": split_point,# 只有连续特征才用得到
                "nonan_indices": nonan_indices,
                "nan_indices": nan_indices,
            }
        
        """
        
        ## 使用找到的最佳结点进行真正划分（整合已缺失样本）
        
        # 划分前先判断：信息增益率的最小值（info_gain_rate太小时不划分）
        if self.min_info_gain_rate is not None and  best_info_gain_rate < self.min_info_gain_rate:
            node.is_leaf = True
            return
        
        # 基于最佳特征进行决策树划分
        nonan_indices, nan_indices = best_sets["nonan_indices"], best_sets["nan_indices"]
        best_feature_i, best_split_point = best_sets["best_feature_i"], best_sets["best_split_point"]
        
        # 记录本节点的状态
        node.best_feature_i = best_feature_i
        node.best_split_point = best_split_point
        node.child_nodes = {} # 记录节点的子节点
        # --leaf_class和info_entropy在递归基时记录，node在划分时记录
        
        # 连续/离散特征采用不同方式进行划分
        if is_linear and best_split_point is not None: # 连续特征使用best_split_point进行二叉划分
            # 找到了最佳划分点,进行真正的划分（取对应的indices）
            left_indices = [i for i in node_indices if X[i, best_feature_i]<= best_split_point]
            right_indices = [i for i in node_indices if ~(X[i, best_feature_i] <= best_split_point)]
            # 整合已缺失样本
            left_indices.extend(nan_indices)
            right_indices.extend(nan_indices)
            # 修改权重,将缺失样本按不同的权重放到两个分支中
            left_weight, right_weight = np.zeros_like(weight),np.zeros_like(weight)
            left_weight[left_indices], right_weight[right_indices] = weight[left_indices], weight[right_indices]
            left_weight[nan_indices], right_weight[nan_indices] = np.sum(weight[left_indices]) / np.sum(weight[nonan_indices]),np.sum(weight[right_indices]) / np.sum(weight[nonan_indices])
            # ---------------左右分支递归建树--------------------
            # 记录本节点的状态
            node.child_nodes["left"] = Node()
            node.child_nodes["right"] = Node()
            # 让递归函数帮忙建立左右子树
            self._build_tree_recussive(X,y,left_indices,left_weight, node.child_nodes["left"], cur_depth+1, is_linear)
            self._build_tree_recussive(X,y,right_indices,right_weight, node.child_nodes["right"], cur_depth+1, is_linear)

        else:# 离散特征直接使用所有划分点进行多叉划分
            split_points = np.unique(X[:, best_feature_i])
            for point in split_points:
                # 对应的indices
                cur_idx = [i for i in node_indices if X[i, best_feature_i] == point]
                # 修改权重,将缺失样本按不同的权重放到多个分支中
                cur_weight = np.zeros_like(weight)
                cur_weight[cur_idx] = weight[cur_idx]
                cur_weight[nan_indices] = np.sum(weight[cur_idx])/np.sum(weight[nonan_indices])
                # ---------------多分支递归建树------------------
                # 记录本节点的状态
                node.child_nodes[point] = Node()
                # 让递归函数帮忙
                self._build_tree_recussive(X,y, cur_idx, cur_weight, node.child_nodes[point], cur_depth + 1, is_linear)

        

模块化

**决策树构建含预剪枝**

In [33]:
class Node(object):
    """通过树结点的各属性记录生成的树结构"""
    def __init__(self,
                 best_feature_i=None, 
                 best_split_point=None,
                 child_nodes = None,
                 leaf_class = None,
                 is_leaf=False,
                 entropy=None):
        """
        每个当前结点Node都记录了当前的划分状况
        :param child_nodes : 结点的所有子结点，字典存储
        :param best_feature_i : 当前结点的最佳划分特征
        :param best_split_point : 当前结点的最佳特征对应的最佳分割点
        :param leaf_class : 记录当前节点所属的类别
        :param is_leaf : 只有在is_leaf==True时，leaf_class才生效
        :param entropy : 当前节点的entropy
        
        """
        self.best_feature_i = best_feature_i
        self.best_split_point = best_split_point
        self.child_nodes = child_nodes
        self.leaf_class = leaf_class
        self.is_leaf = is_leaf
        self.entropy = entropy
        
class DecisionTree():
    """使用cart算法构建决策树"""
    
    def __init__(self, max_depth = float("inf"),min_sample_split=2, min_info_gain_rate=None):
        # 代表决策树的决策树根节点
        self.root_node = None 
        # 预设的决策树最大深度
        self.max_depth = max_depth
        # 预设的决策树叶子节点最小样本数
        self.min_sample_split = min_sample_split
        # 预设的基尼系数增益的最小值（gini_gain太小时不划分）
        self.min_info_gain_rate  = min_info_gain_rate 
    def fit(self, X,y,is_linear=False):
        """
        决策树拟合
        :param X : 训练数据集∈（m,n）
        :param y : 训练标签∈（n,1）
        :param is_linear : 特征是否为连续型
        
        """
        # 创建决策树根结点
        self.root_node = Node()
        # 默认根节点的深度为1
        cur_depth = 1
        # 根节点的初始化权重
        # 样本的初始权重:都为1
        weight = np.ones((len(X))) # 全局的weight:初始化为全1 
        # 递归构建决策树
        self._build_tree_recussive(X,y,np.arange(len(X)),weight,self.root_node, cur_depth, is_linear)
    
    def _build_tree_recussive(self, X,y, node_indices,weight,node:Node, cur_depth, is_linear):
        """
        对于当前节点集合（X，y）-node_indices,递归建立决策树
        :param X: 所有样本
        :param y: 所有标签
        :param node_indices : 当前样本集合对应的索引
        :param weight : 所有样本对应的权重
        :param node : 当前结点的状态记录

        """
        n_samples,n_features = len(node_indices), X.shape[1]
        # 记录本节点的状态
        node.entropy = self._ent(y[node_indices], weight[node_indices])
        node.leaf_class = self._majority_vote(y[node_indices], weight[node_indices])

        ## 递归基
        # 节点包含数据属于同一个类别，此时无需划分
        if len(np.unique(y[node_indices])) <= 1:
            # 记录叶子结点所属的分类
            node.is_leaf = True
            return
        # 没有更多特征(当前节点所含样本所有特征都只有一个取值)
        if np.sum([len(np.unique(X[node_indices][:,i])) for i in range(n_features)]) == n_features:
            node.is_leaf = True
            return
        # 限制构建子树的深度
        if cur_depth >= self.max_depth:
            node.is_leaf = True
            return
        # 限制节点的最小样本量
        if n_samples < self.min_sample_split:
            node.is_leaf = True
            return

        ## 处理当前节点自身(X,y, node_indices,weight)
        # 找到最佳特征和对应的info_gain_rate    
        best_info_gain_rate, best_sets = self._get_best_split_feature(X, y, weight, node_indices, is_linear)
        """
        最佳划分对应的信息，best_sets = {
                "best_feature_i": best_feature_i,
                "best_split_point": split_point,# 只有连续特征才用得到
                "nonan_indices": nonan_indices,
                "nan_indices": nan_indices,
            }
        
        """
        
        ## 使用找到的最佳结点进行真正划分（整合已缺失样本）
        
        # 划分前先判断：信息增益率的最小值（info_gain_rate太小时不划分）
        if self.min_info_gain_rate is not None and  best_info_gain_rate < self.min_info_gain_rate:
            node.is_leaf = True
            return
        
        # 基于最佳特征进行决策树划分
        nonan_indices, nan_indices = best_sets["nonan_indices"], best_sets["nan_indices"]
        best_feature_i, best_split_point = best_sets["best_feature_i"], best_sets["best_split_point"]
        
        # 记录本节点的状态
        node.best_feature_i = best_feature_i
        node.best_split_point = best_split_point
        node.child_nodes = {} # 记录节点的子节点
        # --leaf_class和info_entropy在递归基时记录，node在划分时记录
        
        # 连续/离散特征采用不同方式进行划分
        if is_linear and best_split_point is not None: # 连续特征使用best_split_point进行二叉划分
            # 找到了最佳划分点,进行真正的划分（取对应的indices）
            left_indices = [i for i in node_indices if X[i, best_feature_i] <= best_split_point]
            right_indices = [i for i in node_indices if ~(X[i, best_feature_i] <= best_split_point)]
            # 整合已缺失样本
            left_indices.extend(nan_indices)
            right_indices.extend(nan_indices)
            # 修改权重,将缺失样本按不同的权重放到两个分支中
            left_weight, right_weight = np.zeros_like(weight),np.zeros_like(weight)
            left_weight[left_indices], right_weight[right_indices] = weight[left_indices], weight[right_indices]
            left_weight[nan_indices], right_weight[nan_indices] = np.sum(weight[left_indices]) / np.sum(weight[nonan_indices]),np.sum(weight[right_indices]) / np.sum(weight[nonan_indices])
            # ---------------左右分支递归建树--------------------
            # 记录本节点的状态
            node.child_nodes["left"] = Node()
            node.child_nodes["right"] = Node()
            # 让递归函数帮忙建立左右子树
            self._build_tree_recussive(X,y,left_indices,left_weight, node.child_nodes["left"], cur_depth+1, is_linear)
            self._build_tree_recussive(X,y,right_indices,right_weight, node.child_nodes["right"], cur_depth+1, is_linear)

        else:# 离散特征直接使用所有划分点进行多叉划分
            split_points = np.unique(X[:, best_feature_i])
            for point in split_points:
                # 对应的indices
                cur_idx = [i for i in node_indices if X[i, best_feature_i] == point]
                # 修改权重,将缺失样本按不同的权重放到多个分支中
                cur_weight = np.zeros_like(weight)
                cur_weight[cur_idx] = weight[cur_idx]
                cur_weight[nan_indices] = np.sum(weight[cur_idx])/np.sum(weight[nonan_indices])
                # ---------------多分支递归建树------------------
                # 记录本节点的状态
                node.child_nodes[point] = Node()
                # 让递归函数帮忙
                self._build_tree_recussive(X,y, cur_idx, cur_weight, node.child_nodes[point], cur_depth + 1, is_linear)

        
                                   
    def _get_best_split_feature(self, X,y, weight, node_indices, is_linear):
        """
        找到最佳特征完成特征选择
        :param X : 所有样本
        :param y : 所有样本对应的标签
        :param weight : 所有样本对应的权重
        :param node_indices : 当前样本对应的索引
        :param is_linear : 所有特征的属性是连续型还是离散型

        """

        # 初始化
        best_info_gain_rate = 0
        best_sets = None
        # 特征选择
        n_features = X.shape[1]
        for feature_i in range(n_features):
            ## 找出未缺失的样本
            nonan_indices = [i for i in node_indices if ~np.isnan(X[i, feature_i])]
            # 找出缺失样本
            nan_indices = [i for i in node_indices if np.isnan(X[i, feature_i])]
            ## 进行特征选择时使用信息增益率（info_gain_rate）
            # 1.求本特征的信息熵
            cur_entropy, split_point = self._get_feature_entropy(X[nonan_indices], y[nonan_indices], weight[nonan_indices],feature_i, is_linear)
            # 2.求信息增益info_gain_rate:(划分前的信息熵 - 划分后的信息熵)/本特征的熵
            # 求本特征的熵（注意分母可能为0）
            IV = self._ent(X[nonan_indices][:, feature_i])
            if IV != 0:
                cur_info_gini_rate = (self._ent(y[nonan_indices], weight[nonan_indices]) - cur_entropy)/ IV
            else:
                cur_info_gini_rate = 0
            # 3.计算所有特征的加权info_gini_rate,找到最佳特征
            # 无缺失值样本所占的比例:对每一个样本赋予了权重后,利用权重计算无缺失样本所占的比例
            lou = np.sum(weight[nonan_indices]) / np.sum(weight[node_indices])
            cur_info_gini_rate = lou * cur_info_gini_rate

            # 找到最佳特征
            if cur_info_gini_rate >= best_info_gain_rate:
                best_info_gain_rate = cur_info_gini_rate
                best_sets = {
                    "best_feature_i": feature_i,
                    "best_split_point": split_point,# 只有连续特征才用得到
                    "nonan_indices": nonan_indices,
                    "nan_indices": nan_indices,
                }

        return best_info_gain_rate, best_sets
    
    def _get_feature_entropy(self, X, y, weight, feature_i, is_linear):
        """
        计算特征i的entropy
        :param X :当前结点考虑的样本（无缺失）
        :param y : 当前结点考虑的样本对应标签（无缺失） 
        :param feature_i : 当前欲计算的特征
        :param is_linear : 特征i是否连续型
        :return 特征i的entropy,如果是连续特征，还会返回特征对应的最佳切分点

        """
        # 初始化
        entropy = 0
        best_split_point = None # 只有连续特征才用得到

        # 1、确定特征i的所有可能取值
        feature_values = np.unique(X[:, feature_i])# 已排序的
        if is_linear:
            # 特征是连续型特征则使用二分法找到所有的切分点
            split_points = (feature_values[1:] + feature_values[:-1]) / 2 
            # 连续型feature_i需要找到最佳分割点代表它
            best_split_entropy = 0
            for point in split_points:
                cond = (X[:, feature_i] <= point)
                left_entropy = (np.sum(weight[cond])/np.sum(weight))*self._ent(y[cond],weight[cond])
                right_entropy= (np.sum(weight[~cond])/np.sum(weight))*self._ent(y[~cond], weight[~cond])
                cur_entropy = left_entropy + right_entropy
                if cur_entropy <= best_entropy:
                    best_split_ent = cur_entropy
                    best_split_point = point
            # 此时best split_point,best_entropy就代表了feature_i的entropy
            entropy = best_split_entropy

        else:
            # 离散型特征直接使用特征的各个取值作为切分点
            split_points = feature_values
            for point in split_points:
                cond = (X[:, feature_i] == point)
                cur_entropy = (np.sum(weight[cond])/np.sum(weight)) * self._ent(y[cond], weight[cond])
                entropy += cur_entropy
        return entropy,best_split_point  
    
     
    
    def _ent(self, x, weight=None):
        """
        计算特征的熵/标签的熵
        :param x :数据集的某一个特征的全部对应取值（特征熵）或者标签的熵
        :param weight :每一个样本对应的权重,
        :return 特征的熵/标签的熵

        """
        # weight非空时计算的是标签的熵
        if weight is None:# 说明计算的是特征熵，将weight初始化为全1
            weight = np.ones(len(x))

        unique_x = np.unique(x)
        weight_sum = np.sum(weight)
        p_x = [np.sum(weight[x==a])/weight_sum for a in unique_x]
        return np.sum([-p * np.log2(p) for  p in p_x])
    
    def _majority_vote(self, y,weight):
        """
        根据多数原则定叶子节点所属的分类
        :param y : 当前集合对应的标签
        :param weight : 前集合对应的权重
        :return most_common_class叶子节点所属的分类
        """
        most_common_class = None
        max_distribution = 0
        for k in np.unique(y): 
            distribution = np.sum(weight[y==k])
            if distribution >= max_distribution:
                max_distribution = distribution
                most_common_class = k
        return most_common_class
    
   

In [16]:
# 测试
X_train = np.array([
       [2, 1, 1, 1, 1, 1],
       [1, 1, 2, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [2, 1, 2, 1, 1, 1],
       [3, 1, 1, 1, 1, 1],
       [2, 2, 1, 1, 2, 2],
       [1, 2, 1, 2, 2, 2],
       [1, 2, 1, 1, 2, 1],
       [1, 2, 2, 2, 2, 1],
       [2, 3, 3, 1, 3, 2],
       [3, 3, 3, 3, 3, 1],
       [3, 1, 1, 3, 3, 2],
       [2, 2, 1, 2, 1, 1],
       [3, 2, 2, 2, 1, 1],
       [1, 2, 1, 1, 2, 2],
       [3, 1, 1, 3, 3, 1],
       [2, 1, 2, 2, 2, 1]])
y_train = np.array([1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0])
weight = np.ones(len(X_train))
model = DecisionTree()
model.fit(X_train, y_train)

**决策树预测**
    如果特征是连续型，则是二叉树，选择分叉是按照当前特征值是否<=best_split_point
    如果特征是离散型，则是多叉树， 选择分叉是按照当前特征==point

In [60]:
class Node(object):
    """通过树结点的各属性记录生成的树结构"""
    def __init__(self,
                 best_feature_i=None, 
                 best_split_point=None,
                 child_nodes = None,
                 leaf_class = None,
                 is_leaf=False,
                 entropy=None):
        """
        每个当前结点Node都记录了当前的划分状况
        :param child_nodes : 结点的所有子结点，字典存储
        :param best_feature_i : 当前结点的最佳划分特征
        :param best_split_point : 当前结点的最佳特征对应的最佳分割点
        :param leaf_class : 记录当前节点所属的类别
        :param is_leaf : 只有在is_leaf==True时，leaf_class才生效
        :param entropy : 当前节点的entropy
        
        """
        self.best_feature_i = best_feature_i
        self.best_split_point = best_split_point
        self.child_nodes = child_nodes
        self.leaf_class = leaf_class
        self.is_leaf = is_leaf
        self.entropy = entropy
        
class DecisionTree():
    """使用cart算法构建决策树"""
    
    def __init__(self, max_depth = float("inf"),min_sample_split=2, min_info_gain_rate=None):
        # 代表决策树的决策树根节点
        self.root_node = None 
        # 预设的决策树最大深度
        self.max_depth = max_depth
        # 预设的决策树叶子节点最小样本数
        self.min_sample_split = min_sample_split
        # 预设的基尼系数增益的最小值（gini_gain太小时不划分）
        self.min_info_gain_rate  = min_info_gain_rate 
    def fit(self, X,y,is_linear=False):
        """
        决策树拟合
        :param X : 训练数据集∈（m,n）
        :param y : 训练标签∈（n,1）
        :param is_linear : 特征是否为连续型
        
        """
        # 创建决策树根结点
        self.root_node = Node()
        # 默认根节点的深度为1
        cur_depth = 1
        # 根节点的初始化权重
        # 样本的初始权重:都为1
        weight = np.ones((len(X))) # 全局的weight:初始化为全1 
        # 递归构建决策树
        self._build_tree_recussive(X,y,np.arange(len(X)),weight,self.root_node, cur_depth, is_linear)
    
    def _build_tree_recussive(self, X,y, node_indices,weight,node:Node, cur_depth, is_linear):
        """
        对于当前节点集合（X，y）-node_indices,递归建立决策树
        :param X: 所有样本
        :param y: 所有标签
        :param node_indices : 当前样本集合对应的索引
        :param weight : 所有样本对应的权重
        :param node : 当前结点的状态记录

        """
        n_samples,n_features = len(node_indices), X.shape[1]
        # 记录本节点的状态
        node.entropy = self._ent(y[node_indices], weight[node_indices])
        node.leaf_class = self._majority_vote(y[node_indices], weight[node_indices])

        ## 递归基
        # 节点包含数据属于同一个类别，此时无需划分
        if len(np.unique(y[node_indices])) <= 1:
            # 记录叶子结点所属的分类
            node.is_leaf = True
            return
        # 没有更多特征(当前节点所含样本所有特征都只有一个取值)
        if np.sum([len(np.unique(X[node_indices][:,i])) for i in range(n_features)]) == n_features:
            node.is_leaf = True
            return
        # 限制构建子树的深度
        if cur_depth >= self.max_depth:
            node.is_leaf = True
            return
        # 限制节点的最小样本量
        if n_samples < self.min_sample_split:
            node.is_leaf = True
            return

        ## 处理当前节点自身(X,y, node_indices,weight)
        # 找到最佳特征和对应的info_gain_rate    
        best_info_gain_rate, best_sets = self._get_best_split_feature(X, y, weight, node_indices, is_linear)
        """
        最佳划分对应的信息，best_sets = {
                "best_feature_i": best_feature_i,
                "best_split_point": split_point,# 只有连续特征才用得到
                "nonan_indices": nonan_indices,
                "nan_indices": nan_indices,
            }
        
        """
        
        ## 使用找到的最佳结点进行真正划分（整合已缺失样本）
        
        # 划分前先判断：信息增益率的最小值（info_gain_rate太小时不划分）
        if self.min_info_gain_rate is not None and  best_info_gain_rate < self.min_info_gain_rate:
            node.is_leaf = True
            return
        
        # 基于最佳特征进行决策树划分
        nonan_indices, nan_indices = best_sets["nonan_indices"], best_sets["nan_indices"]
        best_feature_i, best_split_point = best_sets["best_feature_i"], best_sets["best_split_point"]
        # 记录本节点的状态
        node.best_feature_i = best_feature_i
        node.best_split_point = best_split_point
        node.child_nodes = {} # 记录节点的子节点
        # --leaf_class和info_entropy在递归基时记录，node在划分时记录
        
        # 连续/离散特征采用不同方式进行划分
        if is_linear and best_split_point is not None: # 连续特征使用best_split_point进行二叉划分
            # 找到了最佳划分点,进行真正的划分（取对应的indices）
            left_indices = [i for i in node_indices if X[i, best_feature_i] <= best_split_point]
            right_indices = [i for i in node_indices if ~(X[i, best_feature_i] <= best_split_point)]
            # 整合已缺失样本
            left_indices.extend(nan_indices)
            right_indices.extend(nan_indices)
            # 修改权重,将缺失样本按不同的权重放到两个分支中
            left_weight, right_weight = np.zeros_like(weight),np.zeros_like(weight)
            left_weight[left_indices], right_weight[right_indices] = weight[left_indices], weight[right_indices]
            left_weight[nan_indices], right_weight[nan_indices] = np.sum(weight[left_indices]) / np.sum(weight[nonan_indices]),np.sum(weight[right_indices]) / np.sum(weight[nonan_indices])
            # ---------------左右分支递归建树--------------------
            # 记录本节点的状态
            node.child_nodes["left"] = Node()
            node.child_nodes["right"] = Node()
            # 让递归函数帮忙建立左右子树
            self._build_tree_recussive(X,y,left_indices,left_weight, node.child_nodes["left"], cur_depth+1, is_linear)
            self._build_tree_recussive(X,y,right_indices,right_weight, node.child_nodes["right"], cur_depth+1, is_linear)

        else:# 离散特征直接使用所有划分点进行多叉划分
            split_points = np.unique(X[:, best_feature_i])
            for point in split_points:
                # 对应的indices
                cur_idx = [i for i in node_indices if X[i, best_feature_i] == point]
                # 修改权重,将缺失样本按不同的权重放到多个分支中
                cur_weight = np.zeros_like(weight)
                cur_weight[cur_idx] = weight[cur_idx]
                cur_weight[nan_indices] = np.sum(weight[cur_idx])/np.sum(weight[nonan_indices])
                # ---------------多分支递归建树------------------
                # 记录本节点的状态
                node.child_nodes[point] = Node()
                # 让递归函数帮忙
                self._build_tree_recussive(X,y, cur_idx, cur_weight, node.child_nodes[point], cur_depth + 1, is_linear)

        
                                   
    def _get_best_split_feature(self, X,y, weight, node_indices, is_linear):
        """
        找到最佳特征完成特征选择
        :param X : 所有样本
        :param y : 所有样本对应的标签
        :param weight : 所有样本对应的权重
        :param node_indices : 当前样本对应的索引
        :param is_linear : 所有特征的属性是连续型还是离散型

        """

        # 初始化
        best_info_gain_rate = 0
        best_sets = None
        # 特征选择
        n_features = X.shape[1]
        for feature_i in range(n_features):
            ## 找出未缺失的样本
            nonan_indices = [i for i in node_indices if ~np.isnan(X[i, feature_i])]
            # 找出缺失样本
            nan_indices = [i for i in node_indices if np.isnan(X[i, feature_i])]
            ## 进行特征选择时使用信息增益率（info_gain_rate）
            # 1.求本特征的信息熵
            cur_entropy, split_point = self._get_feature_entropy(X[nonan_indices], y[nonan_indices], weight[nonan_indices],feature_i, is_linear)
            # 2.求信息增益info_gain_rate:(划分前的信息熵 - 划分后的信息熵)/本特征的熵
            # 求本特征的熵（注意分母可能为0）
            IV = self._ent(X[nonan_indices][:, feature_i])
            if IV != 0:
                cur_info_gini_rate = (self._ent(y[nonan_indices], weight[nonan_indices]) - cur_entropy)/ IV
            else:
                cur_info_gini_rate = 0
            # 3.计算所有特征的加权info_gini_rate,找到最佳特征
            # 无缺失值样本所占的比例:对每一个样本赋予了权重后,利用权重计算无缺失样本所占的比例
            lou = np.sum(weight[nonan_indices]) / np.sum(weight[node_indices])
            cur_info_gini_rate = lou * cur_info_gini_rate

            # 找到最佳特征
            if cur_info_gini_rate >= best_info_gain_rate:
                best_info_gain_rate = cur_info_gini_rate
                best_sets = {
                    "best_feature_i": feature_i,
                    "best_split_point": split_point,# 只有连续特征才用得到
                    "nonan_indices": nonan_indices,
                    "nan_indices": nan_indices,
                }

        return best_info_gain_rate, best_sets
    
    def _get_feature_entropy(self, X, y, weight, feature_i, is_linear):
        """
        计算特征i的entropy
        :param X :当前结点考虑的样本（无缺失）
        :param y : 当前结点考虑的样本对应标签（无缺失） 
        :param feature_i : 当前欲计算的特征
        :param is_linear : 特征i是否连续型
        :return 特征i的entropy,如果是连续特征，还会返回特征对应的最佳切分点

        """
        # 初始化
        entropy = 0
        best_split_point = None # 只有连续特征才用得到

        # 1、确定特征i的所有可能取值
        feature_values = np.unique(X[:, feature_i])# 已排序的
        if is_linear:
            # 特征是连续型特征则使用二分法找到所有的切分点
            split_points = (feature_values[1:] + feature_values[:-1]) / 2 
            # 连续型feature_i需要找到最佳分割点代表它
            best_split_entropy = 999
            for point in split_points:
                cond = (X[:, feature_i] <= point)
                left_entropy = (np.sum(weight[cond])/np.sum(weight))*self._ent(y[cond],weight[cond])
                right_entropy= (np.sum(weight[~cond])/np.sum(weight))*self._ent(y[~cond], weight[~cond])
                cur_entropy = left_entropy + right_entropy
                if cur_entropy <= best_split_entropy:
                    best_split_entropy = cur_entropy
                    best_split_point = point
            # 此时best split_point,best_entropy就代表了feature_i的entropy
            entropy = best_split_entropy

        else:
            # 离散型特征直接使用特征的各个取值作为切分点
            split_points = feature_values
            for point in split_points:
                cond = (X[:, feature_i] == point)
                cur_entropy = (np.sum(weight[cond])/np.sum(weight)) * ent(y[cond], weight[cond])
                entropy += cur_entropy
        return entropy,best_split_point  
    
     
    
    def _ent(self, x, weight=None):
        """
        计算特征的熵/标签的熵
        :param x :数据集的某一个特征的全部对应取值（特征熵）或者标签的熵
        :param weight :每一个样本对应的权重,
        :return 特征的熵/标签的熵

        """
        # weight非空时计算的是标签的熵
        if weight is None:# 说明计算的是特征熵，将weight初始化为全1
            weight = np.ones(len(x))

        unique_x = np.unique(x)
        weight_sum = np.sum(weight)
        p_x = [np.sum(weight[x==a])/weight_sum for a in unique_x]
        return np.sum([-p * np.log2(p) for  p in p_x])
    
    def _majority_vote(self, y,weight):
        """
        根据多数原则定叶子节点所属的分类
        :param y : 当前集合对应的标签
        :param weight : 前集合对应的权重
        :return most_common_class叶子节点所属的分类
        """
        most_common_class = None
        max_distribution = 0
        for k in np.unique(y): 
            distribution = np.sum(weight[y==k])
            if distribution >= max_distribution:
                max_distribution = distribution
                most_common_class = k
        return most_common_class
    
   
    def predict(self,X,is_linear=False):
        """
        :param X: 待预测的m个样本

        """
        # 每一个样本都通过多叉搜索决策树树查找所属类别,决策树由其根节点作为代表
        y_pred = [self._search_class(x, self.root_node, is_linear) for x in X]
        return y_pred

    def _search_class(self, x, node:Node, is_linear):
        """
        : param x: 待预测所属分类的样本
        : param node : 当前所在节点

        """
        # 递归基
        if node.is_leaf:# 已经走到叶子
            return node.leaf_class
        ## 当前节点的工作
        # 本样本最终要往哪个分支走
        goto = None
        # 根据当前节点的最佳特征及最佳切分点决定x是继续往左边走还是往右边走
        feature_value = x[node.best_feature_i]
        # 离散型/连续型特征处理不同
        if is_linear and node.best_split_point is not None:
            if feature_value <= node.best_split_point:
                goto = node.child_nodes["left"]# 往左边
            else:
                goto = node.child_nodes["right"]# 往右边
        else:
            # 如果特征缺失则无法进行预测？？？
            goto = node.child_nodes[feature_value]
            
        return self._search_class(x, goto, is_linear)
        
    


In [3]:
def accuracy(y_pred, y_true):
    return np.sum(y_pred == y_true) / len(y_pred)

In [16]:
# 测试
X_train = np.array([
       [2, 1, 1, 1, 1, 1],
       [1, 1, 2, 1, 1, 1],
       [1, 1, 1, 1, 1, 1],
       [2, 1, 2, 1, 1, 1],
       [3, 1, 1, 1, 1, 1],
       [2, 2, 1, 1, 2, 2],
       [1, 2, 1, 2, 2, 2],
       [1, 2, 1, 1, 2, 1],
       [1, 2, 2, 2, 2, 1],
       [2, 3, 3, 1, 3, 2],
       [3, 3, 3, 3, 3, 1],
       [3, 1, 1, 3, 3, 2],
       [2, 2, 1, 2, 1, 1],
       [3, 2, 2, 2, 1, 1],
       [1, 2, 1, 1, 2, 2],
       [3, 1, 1, 3, 3, 1],
       [2, 1, 2, 2, 2, 1]])
y_train = np.array([1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0])
weight = np.ones(len(X_train))
model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
accuracy(y_pred, y_train)

1.0

In [18]:
X = np.array([
       [np.nan, 1, 1, 1, 1, 1],
       [1, 1, 2, 1, 1, np.nan],
       [1, 1, np.nan, 1, 1, 1],
       [2, 1, 2, 1, 1, 1],
       [np.nan, 1, 1, 1, 1, 1],
       [2, 2, 1, 1, np.nan, 2],
       [1, 2, 1, 2, 2, 2],
       [1, 2, 1, np.nan, 2, 1],
       [1, np.nan, 2, 2, 2, 1],
       [2, 3, 3, np.nan, 3, 2],
       [3, 3, 3, 3, 3, np.nan],
       [3, 1, np.nan, 3, 3, 2],
       [np.nan, 2, 1, 2, 1, 1],
       [3, 2, 2, 2, 1, 1],
       [1, 2, 1, 1, np.nan, 2],
       [3, 1, 1, 3, 3, 1],
       [2, np.nan, 2, 2, 2, 1]])
y = np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
model = DecisionTree()
model.fit(X,y)
# ！！！ 注意实际上由于特征缺失，上述样本无法用于预测
try:
    y_pred = model.predict(X)
except Exception as e:
    print(e)
accuracy(y_pred, y)

nan


1.0

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
data = load_iris()
X,y = data.data, data.target
X_train,test_X,y_train,test_y = train_test_split(X,y,test_size=0.3)


In [61]:
# 不输入参数——不进行预剪枝
model = DecisionTree()
model.fit(X_train, y_train, is_linear=True)
y_pred = model.predict(test_X, is_linear=True)
accuracy(y_pred, test_y)

0.9555555555555556

In [62]:
# 输入参数——预剪枝
model = DecisionTree(max_depth=6, min_sample_split=6, min_info_gain_rate=0.1)
model.fit(X_train,y_train, is_linear=True)
y_pred = model.predict(test_X, is_linear=True)
accuracy(y_pred,test_y)

0.9555555555555556

## C4.5后剪枝

### Cost complexity pruning—also known as weakest link pruning

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)

**在cart中使用的是这个方法**

### reduced error  pruning

将数据集分为两部分，一部分是训练集，用来建树，一部分是验证集或者称剪枝集，用来剪枝。
![image.png](attachment:image.png)

REP剪枝的优点在于原理简单，而且和原始决策树相比，修剪后的决策树对未来新事例的预测偏差较小。不过缺点同样是很明显的，首先需要单独的验证集进行剪枝。当数据集比较大的时候，这或许不是问题，但当数据集比较小的时候，将数据集一分为二意味着训练集变小，影响树的构建。

### minimum error pruning

是一种自底向上的剪枝策略，但是基于训练样本集，无需独立验证集来剪枝
![image.png](attachment:image.png)


### Pessimistic Error Pruning

下述代码使用Cost complexity pruning，也就是与cart相同

![image.png](attachment:image.png)

根据公式，结点状态Node中要增加样本量

In [None]:
 
    
    def prune(self, alpha=0):
        """
        对决策树进行后剪枝：决策树C4.5剪枝 C(T)+alpha*|T|
        
        """
        return self._pruning_node(self.root_node, alpha)
        
    def _pruning_node(self, node,alpha):
        """
        :param node : 当前处理的节点
        :param alpha: loss的参数，alpha≥0

        """
        ## 递归基:当前节点是叶子节点则直接返回
        if node.is_leaf:
            return 
        ## 让递归函数帮忙处理子树
        for child in node.child_nodes.values():# 多叉树
            self._pruning_node(child, alpha)

        ## 处理当前节点
        # 剪枝后
        post_loss = node.n_sample * node.entropy + alpha * 1 
        # 剪枝前
        pre_loss = alpha * len(node.child_nodes)
        for child in node.child_nodes.values():
            pre_loss += child.n_sample * child.entropy
        # 比较剪枝前的loss与剪枝后的loss
        if post_loss < pre_loss: # 剪枝后loss更小则剪枝（收回左右结点）
            node.child_nodes = None
            node.best_feature_i = None
            node.best_split_point = None
            node.is_leaf = True

In [61]:
a = {1:2, "left":3}
a.values()
for v in a.values():
    print(v)

2
3


In [63]:
class Node(object):
    """通过树结点的各属性记录生成的树结构"""
    def __init__(self,
                 best_feature_i=None, 
                 best_split_point=None,
                 child_nodes = None,
                 leaf_class = None,
                 is_leaf=False,
                 entropy=None, 
                 n_sample=None):
        """
        每个当前结点Node都记录了当前的划分状况
        :param child_nodes : 结点的所有子结点，字典存储
        :param best_feature_i : 当前结点的最佳划分特征
        :param best_split_point : 当前结点的最佳特征对应的最佳分割点
        :param leaf_class : 记录当前节点所属的类别
        :param is_leaf : 只有在is_leaf==True时，leaf_class才生效
        :param entropy : 当前节点的entropy
        
        """
        self.best_feature_i = best_feature_i
        self.best_split_point = best_split_point
        self.child_nodes = child_nodes
        self.leaf_class = leaf_class
        self.is_leaf = is_leaf
        self.entropy = entropy
        self.n_sample = n_sample
        
class DecisionTree():
    """使用cart算法构建决策树"""
    
    def __init__(self, max_depth = float("inf"),min_sample_split=2, min_info_gain_rate=None):
        # 代表决策树的决策树根节点
        self.root_node = None 
        # 预设的决策树最大深度
        self.max_depth = max_depth
        # 预设的决策树叶子节点最小样本数
        self.min_sample_split = min_sample_split
        # 预设的基尼系数增益的最小值（gini_gain太小时不划分）
        self.min_info_gain_rate  = min_info_gain_rate 
        
    def fit(self, X,y,is_linear=False):
        """
        决策树拟合
        :param X : 训练数据集∈（m,n）
        :param y : 训练标签∈（n,1）
        :param is_linear : 特征是否为连续型
        
        """
        # 创建决策树根结点
        self.root_node = Node()
        # 默认根节点的深度为1
        cur_depth = 1
        # 根节点的初始化权重
        # 样本的初始权重:都为1
        weight = np.ones((len(X))) # 全局的weight:初始化为全1 
        # 递归构建决策树
        self._build_tree_recussive(X,y,np.arange(len(X)),weight,self.root_node, cur_depth, is_linear)
    
    def _build_tree_recussive(self, X,y, node_indices,weight,node:Node, cur_depth, is_linear):
        """
        对于当前节点集合（X，y）-node_indices,递归建立决策树
        :param X: 所有样本
        :param y: 所有标签
        :param node_indices : 当前样本集合对应的索引
        :param weight : 所有样本对应的权重
        :param node : 当前结点的状态记录

        """
        n_samples,n_features = len(node_indices), X.shape[1]
        # 记录本节点的状态
        node.entropy = self._ent(y[node_indices], weight[node_indices])
        node.leaf_class = self._majority_vote(y[node_indices], weight[node_indices])
        node.n_sample = n_samples
        ## 递归基
        # 节点包含数据属于同一个类别，此时无需划分
        if len(np.unique(y[node_indices])) <= 1:
            # 记录叶子结点所属的分类
            node.is_leaf = True
            return
        # 没有更多特征(当前节点所含样本所有特征都只有一个取值)
        if np.sum([len(np.unique(X[node_indices][:,i])) for i in range(n_features)]) == n_features:
            node.is_leaf = True
            return
        # 限制构建子树的深度
        if cur_depth >= self.max_depth:
            node.is_leaf = True
            return
        # 限制节点的最小样本量
        if n_samples < self.min_sample_split:
            node.is_leaf = True
            return

        ## 处理当前节点自身(X,y, node_indices,weight)
        # 找到最佳特征和对应的info_gain_rate    
        best_info_gain_rate, best_sets = self._get_best_split_feature(X, y, weight, node_indices, is_linear)
        """
        最佳划分对应的信息，best_sets = {
                "best_feature_i": best_feature_i,
                "best_split_point": split_point,# 只有连续特征才用得到
                "nonan_indices": nonan_indices,
                "nan_indices": nan_indices,
            }
        
        """
        
        ## 使用找到的最佳结点进行真正划分（整合已缺失样本）
        
        # 划分前先判断：信息增益率的最小值（info_gain_rate太小时不划分）
        if self.min_info_gain_rate is not None and  best_info_gain_rate < self.min_info_gain_rate:
            node.is_leaf = True
            return
        
        # 基于最佳特征进行决策树划分
        nonan_indices, nan_indices = best_sets["nonan_indices"], best_sets["nan_indices"]
        best_feature_i, best_split_point = best_sets["best_feature_i"], best_sets["best_split_point"]
        
        # 记录本节点的状态
        node.best_feature_i = best_feature_i
        node.best_split_point = best_split_point
        node.child_nodes = {} # 记录节点的子节点
        # --leaf_class和info_entropy在递归基时记录，node在划分时记录
        
        # 连续/离散特征采用不同方式进行划分
        if is_linear and best_split_point is not None: # 连续特征使用best_split_point进行二叉划分
            # 找到了最佳划分点,进行真正的划分（取对应的indices）
            left_indices = [i for i in node_indices if X[i, best_feature_i] <= best_split_point]
            right_indices = [i for i in node_indices if ~(X[i, best_feature_i] <= best_split_point)]
            # 整合已缺失样本
            left_indices.extend(nan_indices)
            right_indices.extend(nan_indices)
            # 修改权重,将缺失样本按不同的权重放到两个分支中
            left_weight, right_weight = np.zeros_like(weight),np.zeros_like(weight)
            left_weight[left_indices], right_weight[right_indices] = weight[left_indices], weight[right_indices]
            left_weight[nan_indices], right_weight[nan_indices] = np.sum(weight[left_indices]) / np.sum(weight[nonan_indices]),np.sum(weight[right_indices]) / np.sum(weight[nonan_indices])
            # ---------------左右分支递归建树--------------------
            # 记录本节点的状态
            node.child_nodes["left"] = Node()
            node.child_nodes["right"] = Node()
            # 让递归函数帮忙建立左右子树
            self._build_tree_recussive(X,y,left_indices,left_weight, node.child_nodes["left"], cur_depth+1, is_linear)
            self._build_tree_recussive(X,y,right_indices,right_weight, node.child_nodes["right"], cur_depth+1, is_linear)

        else:# 离散特征直接使用所有划分点进行多叉划分
            split_points = np.unique(X[:, best_feature_i])
            for point in split_points:
                # 对应的indices
                cur_idx = [i for i in node_indices if X[i, best_feature_i] == point]
                # 修改权重,将缺失样本按不同的权重放到多个分支中
                cur_weight = np.zeros_like(weight)
                cur_weight[cur_idx] = weight[cur_idx]
                cur_weight[nan_indices] = np.sum(weight[cur_idx])/np.sum(weight[nonan_indices])
                # ---------------多分支递归建树------------------
                # 记录本节点的状态
                node.child_nodes[point] = Node()
                # 让递归函数帮忙
                self._build_tree_recussive(X,y, cur_idx, cur_weight, node.child_nodes[point], cur_depth + 1, is_linear)

        
                                   
    def _get_best_split_feature(self, X,y, weight, node_indices, is_linear):
        """
        找到最佳特征完成特征选择
        :param X : 所有样本
        :param y : 所有样本对应的标签
        :param weight : 所有样本对应的权重
        :param node_indices : 当前样本对应的索引
        :param is_linear : 所有特征的属性是连续型还是离散型

        """

        # 初始化
        best_info_gain_rate = 0
        best_sets = None
        # 特征选择
        n_features = X.shape[1]
        for feature_i in range(n_features):
            ## 找出未缺失的样本
            nonan_indices = [i for i in node_indices if ~np.isnan(X[i, feature_i])]
            # 找出缺失样本
            nan_indices = [i for i in node_indices if np.isnan(X[i, feature_i])]
            ## 进行特征选择时使用信息增益率（info_gain_rate）
            # 1.求本特征的信息熵
            cur_entropy, split_point = self._get_feature_entropy(X[nonan_indices], y[nonan_indices], weight[nonan_indices],feature_i, is_linear)
            # 2.求信息增益info_gain_rate:(划分前的信息熵 - 划分后的信息熵)/本特征的熵
            # 求本特征的熵（注意分母可能为0）
            IV = self._ent(X[nonan_indices][:, feature_i])
            if IV != 0:
                cur_info_gini_rate = (self._ent(y[nonan_indices], weight[nonan_indices]) - cur_entropy)/ IV
            else:
                cur_info_gini_rate = 0
            # 3.计算所有特征的加权info_gini_rate,找到最佳特征
            # 无缺失值样本所占的比例:对每一个样本赋予了权重后,利用权重计算无缺失样本所占的比例
            lou = np.sum(weight[nonan_indices]) / np.sum(weight[node_indices])
            cur_info_gini_rate = lou * cur_info_gini_rate

            # 找到最佳特征
            if cur_info_gini_rate >= best_info_gain_rate:
                best_info_gain_rate = cur_info_gini_rate
                best_sets = {
                    "best_feature_i": feature_i,
                    "best_split_point": split_point,# 只有连续特征才用得到
                    "nonan_indices": nonan_indices,
                    "nan_indices": nan_indices,
                }

        return best_info_gain_rate, best_sets
    
    def _get_feature_entropy(self, X, y, weight, feature_i, is_linear):
        """
        计算特征i的entropy
        :param X :当前结点考虑的样本（无缺失）
        :param y : 当前结点考虑的样本对应标签（无缺失） 
        :param feature_i : 当前欲计算的特征
        :param is_linear : 特征i是否连续型
        :return 特征i的entropy,如果是连续特征，还会返回特征对应的最佳切分点

        """
        # 初始化
        entropy = 0
        best_split_point = None # 只有连续特征才用得到

        # 1、确定特征i的所有可能取值
        feature_values = np.unique(X[:, feature_i])# 已排序的
        if is_linear:
            # 特征是连续型特征则使用二分法找到所有的切分点
            split_points = (feature_values[1:] + feature_values[:-1]) / 2 
            # 连续型feature_i需要找到最佳分割点代表它
            best_split_entropy = 999
            for point in split_points:
                cond = (X[:, feature_i] <= point)
                left_entropy = (np.sum(weight[cond])/np.sum(weight))*self._ent(y[cond],weight[cond])
                right_entropy= (np.sum(weight[~cond])/np.sum(weight))*self._ent(y[~cond], weight[~cond])
                cur_entropy = left_entropy + right_entropy
                if cur_entropy <= best_split_entropy:
                    best_split_entropy = cur_entropy
                    best_split_point = point
            # 此时best split_point,best_entropy就代表了feature_i的entropy
            entropy = best_split_entropy

        else:
            # 离散型特征直接使用特征的各个取值作为切分点
            split_points = feature_values
            for point in split_points:
                cond = (X[:, feature_i] == point)
                cur_entropy = (np.sum(weight[cond])/np.sum(weight)) * ent(y[cond], weight[cond])
                entropy += cur_entropy
        return entropy,best_split_point    
    
     
    
    def _ent(self, x, weight=None):
        """
        计算特征的熵/标签的熵
        :param x :数据集的某一个特征的全部对应取值（特征熵）或者标签的熵
        :param weight :每一个样本对应的权重,
        :return 特征的熵/标签的熵

        """
        # weight非空时计算的是标签的熵
        if weight is None:# 说明计算的是特征熵，将weight初始化为全1
            weight = np.ones(len(x))

        unique_x = np.unique(x)
        weight_sum = np.sum(weight)
        p_x = [np.sum(weight[x==a])/weight_sum for a in unique_x]
        return np.sum([-p * np.log2(p) for  p in p_x])
    
    def _majority_vote(self, y,weight):
        """
        根据多数原则定叶子节点所属的分类
        :param y : 当前集合对应的标签
        :param weight : 前集合对应的权重
        :return most_common_class叶子节点所属的分类
        """
        most_common_class = None
        max_distribution = 0
        for k in np.unique(y): 
            distribution = np.sum(weight[y==k])
            if distribution >= max_distribution:
                max_distribution = distribution
                most_common_class = k
        return most_common_class
    
   
    def predict(self,X,is_linear=False):
        """
        :param X: 待预测的m个样本

        """
        # 每一个样本都通过多叉搜索决策树树查找所属类别,决策树由其根节点作为代表
        y_pred = [self._search_class(x, self.root_node, is_linear) for x in X]
        return y_pred

    def _search_class(self, x, node:Node, is_linear=False):
        """
        : param x: 待预测所属分类的样本
        : param node : 当前所在节点

        """
        # 递归基
        if node.is_leaf:# 已经走到叶子
            return node.leaf_class
        ## 当前节点的工作
        # 本样本最终要往哪个分支走
        goto = None
        # 根据当前节点的最佳特征及最佳切分点决定x是继续往左边走还是往右边走
        feature_value = x[node.best_feature_i]
        # 离散型/连续型特征处理不同
        if is_linear and node.best_split_point is not None:
            if feature_value <= node.best_split_point:
                goto = node.child_nodes["left"]# 往左边
            else:
                goto = node.child_nodes["right"]# 往右边
        else:
            # 如果特征缺失则无法进行预测？？？
            goto = node.child_nodes[feature_value]
            
        return self._search_class(x, goto, is_linear)
        
    def prune(self, alpha=0):
        """
        对决策树进行后剪枝：决策树C4.5剪枝 C(T)+alpha*|T|
        
        """
        return self._pruning_node(self.root_node, alpha)
        
    def _pruning_node(self, node, alpha):
        """
        :param node : 当前处理的节点
        :param alpha: loss的参数，alpha≥0

        """
        ## 递归基:当前节点是叶子节点则直接返回
        if node.is_leaf:
            return 
        ## 让递归函数帮忙处理子树
        for child in node.child_nodes.values():# 多叉树
            self._pruning_node(child, alpha)

        ## 处理当前节点
        # 剪枝后
        post_loss = node.n_sample * node.entropy + alpha * 1 
        # 剪枝前
        pre_loss = alpha * len(node.child_nodes)
        for child in node.child_nodes.values():
            pre_loss += child.n_sample * child.entropy
        # 比较剪枝前的loss与剪枝后的loss
        if post_loss < pre_loss: # 剪枝后loss更小则剪枝（收回左右结点）
            node.child_nodes = None
            node.best_feature_i = None
            node.best_split_point = None
            node.is_leaf = True


In [75]:
# 不输入参数——不进行预剪枝
model = DecisionTree()
model.fit(X_train, y_train, is_linear=True)
y_pred = model.predict(test_X, is_linear=True)
print(accuracy(y_pred, test_y))
# 后剪枝
model.prune(10)
y_pred = model.predict(test_X,is_linear=True)
print(accuracy(y_pred,test_y))

0.9555555555555556
0.9333333333333333
