In [2]:
#adaptive boosting:是一种集成方法，通过组合多个弱分类器的分类结果，进行加权求和的分类结果

import numpy as np
import matplotlib.pyplot as plt

def loadSimpData():
     datMat = np.matrix(
        [[ 1. ,  2.1],
        [ 2. ,  1.1],
        [ 1.3,  1. ],
        [ 1. ,  1. ],
        [ 2. ,  1. ]])
     classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
     return datMat, classLabels

datMat, classLabels = loadSimpData()

xcord0 = []
ycord0 = []
xcord1 = []
ycord1 = []
markers =[]
colors =[]

for i in range(len(classLabels)):
    if  classLabels[i] == 1.0:
        xcord1.append(datMat[i, 0]), ycord1.append(datMat[i, 1])
    else:
        xcord0.append(datMat[i, 0]), ycord0.append(datMat[i, 1])
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(xcord0, ycord0, marker='s', s=90)
ax.scatter(xcord1, ycord1, marker='o', s=50, c='red')
plt.xlabel('x-axis')
plt.ylabel('y-axis')
plt.title('my test data')
plt.show()

In [3]:
#分类函数
#(数据集，特征，阈值，阈值判定方法)
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    retArray = np.ones((np.shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        retArray[dataMatrix[:,dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:,dimen] > threshVal] = -1.0
    return retArray
#构建单层决策树（decision stump决策树桩）
#(数据集，分类标签y,样本数据的权重向量D)
def buildStump(dataArr, classLabels, D):
    dataMatrix = np.mat(dataArr)
    labelMat = np.mat(classLabels).T
    m, n = np.shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClassEst = np.mat(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:,i].min()
        rangeMax = dataMatrix[:,i].max()
        stepSize = (rangeMax - rangeMin)
        for j in range(-2, int(numSteps) + 1):
            for inequal in ['lt', 'gt']:
                threshVal = (rangeMin + float(j) * stepSize)
                #这里调用预测分类函数
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = np.mat(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T*errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst

In [4]:
D = np.mat(np.ones((5,1)) / 5.0)
bestStump, minError, bestClassEst = buildStump(datMat, classLabels, D)

In [5]:
print(bestStump)
print(minError)
print(bestClassEst)

{'dim': 1, 'thresh': 1.0, 'ineq': 'lt'}
[[0.2]]
[[ 1.]
 [ 1.]
 [-1.]
 [-1.]
 [-1.]]


In [6]:
#单层决策树的训练过程 ：p117的公式
#首先，基于样本的权重向量D（开始每个样本都是相同值）,
# 然后训练一个弱分类器（单层决策树），并且得到分类的错误率（加权）和该分类器的权重值alpha
#接着迭代：在同一个数据集中，调整D（分对的，权重降低，分错的，权重提高），再训练得到分类器\错误率\alpha
#最后得到所有分类器的加权结果:sum(alpha[i]*y[i])
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    weakClassArr = []
    m = np.shape(dataArr)[0]
    D = np.mat(np.ones((m,1))/m)
    aggClassEst=np.mat(np.zeros((m,1)))
    errorRate=0.0
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        alpha = float(0.5 * np.log((1.0 -error)/max(error,1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        expon = np.multiply(-1*alpha*np.mat(classLabels).T, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D/D.sum()
        aggClassEst += alpha*classEst
        aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1)))
        errorRate = aggErrors.sum()/m
        if errorRate ==0.0:
            break
    return weakClassArr, aggClassEst, errorRate

In [7]:
weakClassArr,aggClassEst,errorRate = adaBoostTrainDS(datMat,classLabels)

In [8]:
#adaBoost分类函数
#（数据集，训练好的多个分类器）
#利用训练好的多个弱分类器，进行加权分类
def adaClassify(dataToClass, classifierArr):
    dataMatrix = np.mat(dataToClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.mat(np.zeros((m, 1)))
    for i in range(len(classifierArr)):
        classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'],
                                 classifierArr[i]['thresh'],
                                 classifierArr[i]['ineq'])
        aggClassEst += classifierArr[i]['alpha']*classEst
    return np.sign(aggClassEst)

In [9]:
pred = adaClassify([[5,5],[0,0]], weakClassArr)
print(pred)

[[ 1.]
 [-1.]]


In [12]:
#用一个难数据集，测试一下
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = [];labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

In [13]:
#训练
trainArr, trainLabel = loadDataSet('MachineLearningInAction/AdaBoost/horseColicTraining2.txt')
weakClassArr1, aggClassEst1, errorRate1 = adaBoostTrainDS(trainArr, trainLabel, 10)

In [14]:
testArr, testLabel = loadDataSet('MachineLearningInAction/AdaBoost/horseColicTest2.txt')
pred1 = adaClassify(testArr, weakClassArr1)
testError = np.mean(pred1.A.ravel()!=np.array(testLabel))
print(testError)

0.35451505016722407


In [18]:
#来看看分类器的个数的增加，算法的训练和测试效果
#这里稍微改了一下adaBoostTrainDS(),最后一行多一个返回errorRate,注释掉一些输出
for iters in [1, 10, 30, 50, 100]:
    weakClassArr, aggClassEst, errorRate = adaBoostTrainDS(trainArr,trainLabel,iters)
    pred = adaClassify(testArr, weakClassArr)
    testError=np.mean(pred.A.ravel()!=np.array(testLabel))
    print('at iters ', iters, ' train error=', errorRate, 'test error=', testError)

at iters  1  train error= 0.3779264214046823 test error= 0.35451505016722407


at iters  10  train error= 0.35451505016722407 test error= 0.35451505016722407


at iters  30  train error= 0.33444816053511706 test error= 0.35451505016722407


at iters  50  train error= 0.31438127090301005 test error= 0.35451505016722407


at iters  100  train error= 0.3076923076923077 test error= 0.35451505016722407


In [22]:
#ROC曲线绘制
def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plt
    cur = (1.0,1.0)
    ySum = 0.0
    numPosClas = sum(np.array(classLabels) == 1.0)
    yStep = 1/float(numPosClas)
    xStep = 1/float(len(classLabels) - numPosClas)
    sortedIndicies = predStrengths.argsort()
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(1,1,1)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0; delY = yStep
        else:
            delX = xStep; delY = 0
            ySum+=cur[1]
        ax.plot([cur[0], cur[0]-delX],[cur[1],cur[1]-delY], c='b')
        cur = (cur[0] -delX, cur[1]-delY)
    ax.plot([0,1],[0,1],'b--')
    plt.xlabel('假正例');plt.ylabel('真正例')
    ax.axis([0,1,0,1])
    plt.show()
    print('AUC is:', ySum * xStep)

In [23]:
trainArr, trainLabel = loadDataSet('MachineLearningInAction/AdaBoost/horseColicTraining2.txt')
classifierArray, aggClassEst, tmp = adaBoostTrainDS(trainArr, trainLabel,10)
plotROC(aggClassEst.T, trainLabel)

AUC is: 0.6976042343764506
