# 利用AdaBoost集成方法提高分类性能

In [1]:
from numpy import *

def loadSimpleData():
    '''Return a tiny toy dataset of five samples with two features each.

    Returns:
        A ``(datMat, classLabels)`` pair: a 5x2 ``matrix`` of feature
        values and a plain list of five labels, each ``1.0`` or ``-1.0``.
    '''
    samples = [
        [1.0, 2.1],
        [1.5, 1.6],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return matrix(samples), labels

# Build the toy dataset once; the later cells reuse datMat and classLabels.
datMat, classLabels = loadSimpleData()

## 单层决策树生成函数

In [2]:
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    '''Label every row +1.0 or -1.0 by thresholding a single feature.

    Args:
        dataMatrix: 2-D data, one sample per row.
        dimen: column index of the feature to compare.
        threshVal: threshold the feature value is compared against.
        threshIneq: ``'lt'`` marks rows with value <= threshVal as -1.0;
            any other value marks rows with value > threshVal as -1.0.

    Returns:
        An (m, 1) array holding 1.0 or -1.0 for each of the m rows.
    '''
    featureColumn = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negativeMask = featureColumn <= threshVal
    else:
        negativeMask = featureColumn > threshVal
    # Everything starts in the +1 class; only the masked rows are flipped.
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[negativeMask] = -1.0
    return predictions

def buildStump(dataArr, classLabels, D):
    '''Find the single-feature decision stump with the lowest weighted error.

    Every feature is scanned over evenly spaced thresholds, and both
    inequality directions ('lt' and 'gt') are tried for each threshold.

    Args:
        dataArr: 2-D data, one sample per row (converted with ``asmatrix``).
        classLabels: sequence of per-sample labels, compared for equality
            against the +1.0 / -1.0 stump predictions.
        D: column of per-sample weights; the callers in this file pass an
            m x 1 matrix.

    Returns:
        A ``(bestStump, minError, bestClassEst)`` tuple: a dict with keys
        ``'dim'``, ``'thresh'`` and ``'ineq'`` describing the best stump,
        its weighted error (the 1x1 result of ``D.T * errArr``), and that
        stump's (m, 1) predictions. If no candidate is evaluated, the dict
        is empty and the error stays ``inf``.
    '''
    # range/asmatrix instead of xrange/mat: the latter two no longer exist
    # on Python 3 / NumPy 2, while the former work on old versions too.
    dataMatrix = asmatrix(dataArr)
    labelMat = asmatrix(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEst = asmatrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        # One pass per feature.
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j starts at -1 so the first threshold lies one step below the
        # feature's minimum value.
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction,
            # so compute it once per step.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lt', 'gt']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the prediction is wrong, 0 where it is right.
                errArr = asmatrix(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

In [3]:
# Start from uniform weights (1/5 for each of the five toy samples) and
# print the best single stump found under those weights.
D = mat(ones((5, 1))/5)
bestStump, minError, bestClassEst = buildStump(datMat, classLabels, D)
print(bestStump, minError, bestClassEst)

({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[ 0.2]]), array([[-1.],
       [ 1.],
       [-1.],
       [-1.],
       [ 1.]]))


## AdaBoost算法
对每次迭代：

    利用buildStump()函数找到最佳的单层决策树
    计算alpha
    将单层最佳决策树加入单层决策树数组
    计算新的权重向量D
    更新累计类别估计值
    如果错误率等于0.0，则退出循环

In [4]:
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    '''Train an AdaBoost ensemble of decision stumps.

    Each round picks the best stump for the current sample weights,
    computes its vote weight ``alpha``, re-weights the samples and adds
    the stump's weighted vote to the running class estimate. Training
    stops early once the running estimate has zero training error.
    Progress is printed every round.

    Args:
        dataArr: 2-D training data, one sample per row.
        classLabels: sequence of per-sample labels (the toy data in this
            file uses 1.0 and -1.0).
        numIt: maximum number of boosting rounds.

    Returns:
        A list of stump dicts in training order, each with the keys set
        by ``buildStump`` plus ``'alpha'``.
    '''
    weakClassArr = []
    m = shape(dataArr)[0]
    # The label column never changes, so build it once instead of
    # three times per round.
    labelMat = asmatrix(classLabels).T
    D = asmatrix(ones((m, 1)) / m)
    aggClassEst = asmatrix(zeros((m, 1)))
    for _ in range(numIt):
        # Find the best stump for the current weights.
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        # buildStump hands back its error as a 1x1 matrix; take the scalar
        # out with .item(), because float() on a non-0-d array is
        # deprecated in recent NumPy.
        errorVal = asarray(error).item()
        # Vote weight; the max() guards against division by zero when the
        # stump makes no weighted mistakes.
        alpha = float(0.5 * log((1.0 - errorVal) / max(errorVal, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        print("ClassEst: ", classEst.T)
        # Re-weight: the exponent is -alpha where label and prediction
        # agree and +alpha where they differ; then renormalise to sum 1.
        expon = multiply(-1 * alpha * labelMat, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        # Accumulate this stump's weighted vote.
        aggClassEst += alpha * classEst
        print("aggClassEst: ", aggClassEst.T)
        aggErrors = multiply(sign(aggClassEst) != labelMat, ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate, "\n")
        # Stop as soon as the ensemble fits the training set perfectly.
        if errorRate == 0.0:
            break
    return weakClassArr

In [5]:
# Train on the toy data for at most 9 boosting rounds and print the
# resulting list of weak classifiers.
classifierArray = adaBoostTrainDS(datMat, classLabels, 9)
print(classifierArray)

('D:', matrix([[ 0.2,  0.2,  0.2,  0.2,  0.2]]))
('ClassEst: ', array([[-1.,  1., -1., -1.,  1.]]))
('aggClassEst: ', matrix([[-0.69314718,  0.69314718, -0.69314718, -0.69314718,  0.69314718]]))
('total error: ', 0.20000000000000001, '\n')
('D:', matrix([[ 0.5  ,  0.125,  0.125,  0.125,  0.125]]))
('ClassEst: ', array([[ 1.,  1., -1., -1., -1.]]))
('aggClassEst: ', matrix([[ 0.27980789,  1.66610226, -1.66610226, -1.66610226, -0.27980789]]))
('total error: ', 0.20000000000000001, '\n')
('D:', matrix([[ 0.28571429,  0.07142857,  0.07142857,  0.07142857,  0.5       ]]))
('ClassEst: ', array([[ 1.,  1.,  1.,  1.,  1.]]))
('aggClassEst: ', matrix([[ 1.17568763,  2.56198199, -0.77022252, -0.77022252,  0.61607184]]))
('total error: ', 0.0, '\n')
[{'dim': 0, 'ineq': 'lt', 'thresh': 1.3, 'alpha': 0.6931471805599453}, {'dim': 1, 'ineq': 'lt', 'thresh': 1.0, 'alpha': 0.9729550745276565}, {'dim': 0, 'ineq': 'lt', 'thresh': 0.90000000000000002, 'alpha': 0.8958797346140273}]


## 测试算法

In [6]:
def adaClassify(datToClass, classifierArr):
    '''Classify samples with a trained list of weighted decision stumps.

    Args:
        datToClass: 2-D data, one sample per row (converted with
            ``asmatrix``).
        classifierArr: iterable of stump dicts with the keys ``'dim'``,
            ``'thresh'``, ``'ineq'`` and ``'alpha'``.

    Returns:
        An m x 1 matrix holding the sign of the alpha-weighted sum of the
        stump predictions for each of the m samples.

    The running weighted sum is printed after every stump.
    '''
    # asmatrix instead of mat, and direct iteration instead of
    # xrange(len(...)): both also work on Python 3 / NumPy 2.
    dataMatrix = asmatrix(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = asmatrix(zeros((m, 1)))
    for classifier in classifierArr:
        classEst = stumpClassify(dataMatrix, classifier['dim'],
                                 classifier['thresh'], classifier['ineq'])
        aggClassEst += classifier['alpha'] * classEst
        print(aggClassEst)
    return sign(aggClassEst)

In [7]:
# Classify two new points with the trained ensemble and print the result.
print(adaClassify([[5,5],[0,0]], classifierArray))

[[ 0.69314718]
 [-0.69314718]]
[[ 1.66610226]
 [-1.66610226]]
[[ 2.56198199]
 [-2.56198199]]
[[ 1.]
 [-1.]]
