In [1]:
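# Binary classification problem
# Algorithm outline
##func1. Data preparation
##func2. Function that computes the error rate
##func3. Weight update (done inline in the training function rather than as a separate function)
##func4. singleTree weak classifier
##func5. train() function
##func6. predict() prediction function
##func7. score()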
import time
import numpy as np
import pandas as pd
import random
import math


from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [5]:
# Load iris into a DataFrame with one column per feature plus a 'label' column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = [
    'speal length', 'speal width', 'petal legth', 'petal width', 'label'
]
# Recode class 0 as -1 in the label column only. A frame-wide
# `df[df == 0] = -1` would also overwrite any feature value equal to 0.
df.loc[df['label'] == 0, 'label'] = -1
# Keep the first 100 rows and only the first two features plus the label.
# NOTE(review): presumably those rows hold exactly classes 0 and 1 (now -1
# and 1), making this a binary problem -- confirm against the dataset order.
DataArr = np.array(df.iloc[:100, [0, 1, -1]])
XArr, YArr = DataArr[:, :2], DataArr[:, -1]

def shuffle(X, Y):
    '''
    Reorder X and Y with one shared random permutation.

    A single index permutation is drawn from NumPy's global random state and
    applied to both inputs, so row i of the shuffled X still corresponds to
    element i of the shuffled Y.

    X, Y: objects supporting integer-array indexing; len(X) entries are permuted
    return: (shuffled X, shuffled Y)
    '''
    order = np.arange(len(X))
    # In-place permutation of the index vector.
    np.random.shuffle(order)

    X_shuffled = X[order]
    Y_shuffled = Y[order]
    return X_shuffled, Y_shuffled

# Seed NumPy's global RNG (which shuffle draws from) and pin the split's
# random_state so that restarting the kernel and re-running the notebook
# reproduces the same train/test partition.
SEED = 42
np.random.seed(SEED)
XArr, YArr = shuffle(XArr, YArr)
X_TrainArr, X_TestArr, Y_TrainArr, Y_TestArr = train_test_split(
    XArr, YArr, random_state=SEED
)

In [None]:
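# Problem solved: binary classification, Y_label = {-1, 1}. For simplicity the features X are taken to be binarized: Xi = {0, 1}

# Implementation outline
## Compute the classification error rate: inputs are the dataset and the parameters of the weak classifier; returns the predictions and the classification error rate
## Single-level boosting tree: input: data and weight distribution; output: the single-level boosting tree that was created
## Build the boosting tree: input: data; output: the boosting tree
## Predict results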

def calc_e_Gx(X_trainArr, Y_trainArr, n, div, D):
    '''
    Evaluate a single-feature threshold stump and its weighted error rate.

    The stump predicts -1 for samples whose feature ``n`` is strictly below
    ``div`` and +1 for all other samples.

    X_trainArr: 2-D array of samples (rows) by features (columns)
    Y_trainArr: 1-D array of true labels, one per sample
    n: index of the feature (column) the stump splits on
    div: split threshold
    D: 1-D array with the current weight of each sample
    return: (predictions as an array of -1/+1, weighted error rate), where the
            error rate is the sum of D over the misclassified samples
    '''
    feature_values = np.asarray(X_trainArr)[:, n]
    labels = np.asarray(Y_trainArr)
    weights = np.asarray(D)

    # Build the prediction vector in one step; assigning by index into an
    # empty list raises IndexError.
    predict = np.where(feature_values < div, -1, 1)
    # Only the weights of the misclassified samples contribute to the error.
    e = weights[predict != labels].sum()

    return predict, e
    
def creatSingleTree(X_trainArr, Y_trainArr, D):
    '''
    Find the single-feature threshold stump with the lowest weighted error.

    Every feature is tried against a set of candidate thresholds derived from
    that feature's own values: one below the minimum, the midpoint between
    each pair of consecutive distinct values, and one above the maximum.
    For a feature taking both values 0 and 1 this yields -0.5, 0.5 and 1.5;
    for continuous features it yields splits that actually separate the data.
    Each candidate is scored with ``calc_e_Gx``. Any other weak classifier
    could be substituted for this function.

    X_trainArr: 2-D array of samples (rows) by features (columns)
    Y_trainArr: 1-D array of true labels, one per sample
    D: 1-D array with the current weight of each sample
    return: dict with keys
        'e'       -- weighted error rate of the chosen stump
        'div'     -- chosen threshold
        'Gx'      -- the stump's predictions on the training samples
        'feature' -- index of the feature the stump splits on
        Only 'e' (== 1) is present if no candidate scored below 1.
    '''
    X_trainArr = np.asarray(X_trainArr)
    _, n = np.shape(X_trainArr)

    # Start from the worst possible error so that any real stump replaces it.
    singleBoostTree = {'e': 1}

    for feature in range(n):
        values = np.unique(X_trainArr[:, feature])  # sorted distinct values
        if values.size == 0:
            continue
        candidates = np.concatenate((
            [values[0] - 0.5],                  # everything predicted +1
            (values[:-1] + values[1:]) / 2,     # between neighbouring values
            [values[-1] + 0.5],                 # everything predicted -1
        ))
        for div in candidates:
            Gx, e = calc_e_Gx(X_trainArr, Y_trainArr, feature, div, D)
            # Strict '<' keeps the first stump found among equally good ones.
            if e < singleBoostTree['e']:
                singleBoostTree['e'] = e
                singleBoostTree['div'] = float(div)
                singleBoostTree['Gx'] = Gx
                singleBoostTree['feature'] = feature
    return singleBoostTree

def creatBoostingTree(X_trainArr, Y_trainArr, treeNum = 50):
    '''
    Train an AdaBoost ensemble of single-feature threshold stumps.

    Each round fits one stump with ``creatSingleTree`` under the current
    sample weights, assigns it the vote weight
    ``alpha = 0.5 * ln((1 - e) / e)`` from its weighted error ``e``, and then
    re-weights the samples so that the ones the stump misclassified weigh
    more in the next round. Training stops early as soon as the sign of the
    accumulated weighted vote matches every training label.

    X_trainArr: 2-D array of samples (rows) by features (columns)
    Y_trainArr: 1-D array of true labels in {-1, 1}, one per sample
    treeNum: maximum number of weak classifiers; a hyper-parameter that can
             be selected by cross-validation
    return: list of stump dicts, one per round, each holding what
            creatSingleTree returned plus the key 'alpha'
    '''
    Y_trainArr = np.asarray(Y_trainArr)
    m, _ = np.shape(X_trainArr)

    # Uniform initial weight distribution over the samples.
    D = np.full(m, 1 / m)
    # Running alpha-weighted vote of all stumps on the training samples.
    finalpredict = np.zeros(m)
    iterationNum = 0
    # Before any round the vote is all zeros and matches no +/-1 label.
    error_rate = 1.0
    tree = []
    # Keeps alpha finite when a stump is perfect (e == 0) or always wrong.
    eps = 1e-10

    for _ in range(treeNum):
        iterationNum += 1
        curTree = creatSingleTree(X_trainArr, Y_trainArr, D)

        e = min(max(curTree['e'], eps), 1 - eps)
        alpha = 1 / 2 * np.log((1 - e) / e)
        Gx = curTree['Gx']

        # Y * Gx is +1 where the stump is right and -1 where it is wrong, so
        # correct samples are scaled down and wrong ones up; then renormalize.
        weighted = D * np.exp(-alpha * Y_trainArr * Gx)
        D = weighted / weighted.sum()

        curTree['alpha'] = alpha
        tree.append(curTree)

        # Training error of the ensemble built so far.
        finalpredict += alpha * Gx
        error_rate = float(np.mean(np.sign(finalpredict) != Y_trainArr))

        # Nothing left to learn once the training error reaches zero.
        if error_rate == 0: return tree
    print('Numbers of iteration: {}, \n error rate: {}'.format(iterationNum, error_rate))
    return tree

def Gx_predict(x, div, feature):
    '''
    Classify one sample with a threshold stump.

    x: a single sample, indexable by feature position
    div: split threshold
    feature: index of the feature to compare against div
    return: -1 if x[feature] is strictly below div, otherwise 1
    '''
    return -1 if x[feature] < div else 1

def model_predict(X_testArr, tree):
    '''
    Compute the ensemble's weighted vote for every sample.

    For each sample, every stump in ``tree`` votes -1 or +1 via
    ``Gx_predict`` and the votes are summed weighted by the stump's 'alpha'.
    The value returned is this raw sum, not a class label; its sign gives
    the predicted class.

    X_testArr: sequence of samples, each indexable by feature position
    tree: list of stump dicts providing 'div', 'alpha' and 'feature'
    return: list with one weighted vote sum per sample
    '''
    def weighted_vote(sample):
        total = 0
        for stump in tree:
            threshold, weight, column = stump['div'], stump['alpha'], stump['feature']
            total = total + weight * Gx_predict(sample, threshold, column)
        return total

    return [weighted_vote(X_testArr[idx]) for idx in range(len(X_testArr))]

def model_score(X_testArr, Y_testArr, tree):
    '''
    Accuracy of the boosted ensemble on a labelled set.

    ``model_predict`` returns the raw alpha-weighted vote for each sample,
    so the predicted class is the sign of that vote. Comparing the raw vote
    itself to the +/-1 labels would count almost every sample as an error.
    A vote of exactly 0 has sign 0 and is counted as an error.

    X_testArr: sequence of samples, each indexable by feature position
    Y_testArr: sequence of true labels in {-1, 1}, one per sample
    tree: list of stump dicts as consumed by model_predict
    return: fraction of samples classified correctly, in [0, 1]
    raises: ValueError if Y_testArr is empty (accuracy is undefined)
    '''
    total = len(Y_testArr)
    if total == 0:
        raise ValueError('Y_testArr is empty; accuracy is undefined')

    predicted_labels = np.sign(model_predict(X_testArr, tree))
    error_count = int(np.sum(predicted_labels != np.asarray(Y_testArr)))
    score = 1 - (error_count / total)

    return score
## 没有数据不能测试