In [3]:
import numpy as np;

def loadSimpleData():
    """Return the five-point toy data set used to exercise the stump / AdaBoost code.

    Returns:
        tuple: ``(datMat, classLabels)`` where ``datMat`` is a 5x2 ``np.matrix``
        of feature values and ``classLabels`` is a plain list holding the
        matching +1.0 / -1.0 label of each row.
    """
    samples = [
        [1.0, 2.1],
        [2.0, 1.1],
        [1.3, 1.0],
        [1.0, 1.0],
        [2.0, 1.0],
    ]
    labels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return np.matrix(samples), labels
    
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every row with a one-feature threshold test (a decision stump).

    Args:
        dataMatrix: 2-D data, one sample per row.
        dimen: index of the feature column to test.
        threshVal: threshold the feature value is compared against.
        threshIneq: ``'lessthan'`` sends rows with value <= threshold to class
            -1; any other string sends rows with value > threshold to class -1.

    Returns:
        An ``(m, 1)`` ndarray of +1.0 / -1.0 predictions, m being the row count.
    """
    feature = dataMatrix[:, dimen]
    if threshIneq == 'lessthan':
        # Rows at or below the threshold belong to the -1 class.
        negative = feature <= threshVal
    else:
        # Rows strictly above the threshold belong to the -1 class.
        negative = feature > threshVal
    # Start from "everything is +1" and flip only the selected rows.
    predictions = np.ones((np.shape(dataMatrix)[0], 1))
    predictions[negative] = -1.0
    return predictions

def buildStump(dataArr, classLabels, D, numSteps=10.0, verbose=True):
    """Find the decision stump with the lowest weighted error under weights D.

    Every feature column is scanned; for each one, candidate thresholds are
    laid out from one step below the column minimum up to the column maximum,
    and both inequality directions are tried.

    Args:
        dataArr: 2-D data, one sample per row (anything ``np.asmatrix`` accepts).
        classLabels: sequence of +1 / -1 labels, one per row.
        D: ``(m, 1)`` column of per-sample weights.
        numSteps: number of threshold steps across each feature's range
            (default 10.0, the value that used to be hard-coded).
        verbose: when True (default, the previous behaviour) print one line
            per candidate split.

    Returns:
        tuple: ``(bestStump, minError, bestClasEst)`` where ``bestStump`` is a
        dict with keys ``'dim'``, ``'thresh'`` and ``'ineq'``; ``minError`` is
        the 1x1 matrix holding the lowest weighted error (``float('inf')`` if
        no candidate was evaluated); ``bestClasEst`` is the ``(m, 1)``
        prediction column of the winning stump.
    """
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    # Force matrix semantics so that ``D.T * errArr`` below is a dot product.
    D = np.asmatrix(D)
    m, n = np.shape(dataMatrix)
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = float("inf")  # positive infinity: any real error beats it
    for i in range(n):  # one pass per feature column
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        for j in range(-1, int(numSteps) + 1):  # thresholds grow by stepSize
            # The threshold does not depend on the inequality direction.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ['lessthan', 'morethan']:
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # Error indicator: 0 where the prediction is right, 1 where wrong.
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Weighted error rate = weights dotted with the error indicator.
                weightedError = D.T * errArr
                if verbose:
                    # Pass the scalar explicitly: formatting a 1x1 matrix with
                    # %f relies on a deprecated array-to-scalar conversion.
                    print("split : dim %d, thresh %.2f, thresh ineqal: %s, the weight error is %.3f" % (i, threshVal, inequal, weightedError[0, 0]))
                if weightedError[0, 0] < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

# Smoke test on the toy data: uniform sample weights, then search for the best stump.
datMat, classLabels = loadSimpleData()
# Derive the weight count from the data instead of hard-coding 5;
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
D = np.asmatrix(np.ones((datMat.shape[0], 1)) / datMat.shape[0])
buildStump(datMat, classLabels, D)

split : dim 0, thresh 0.90, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 0.90, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.00, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 1.00, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.10, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 1.10, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.20, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 1.20, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.30, thresh ineqal: lessthan, the weight error is 0.200
split : dim 0, thresh 1.30, thresh ineqal: morethan, the weight error is 0.800
split : dim 0, thresh 1.40, thresh ineqal: lessthan, the weight error is 0.200
split : dim 0, thresh 1.40, thresh ineqal: morethan, the weight error is 0.800
split : dim 0, thresh 1.50, thresh ineqal: lessthan,

({'dim': 0, 'thresh': 1.3, 'ineq': 'lessthan'}, matrix([[0.2]]), array([[-1.],
        [ 1.],
        [-1.],
        [-1.],
        [ 1.]]))

In [4]:
def adaBoostTrainDS(dataArr, classLabels, numIt = 40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: 2-D training data, one sample per row.
        classLabels: sequence of +1 / -1 labels, one per row.
        numIt: maximum number of boosting rounds (default 40); training stops
            earlier once the ensemble's training error reaches 0.

    Returns:
        list: one dict per weak classifier, as produced by ``buildStump``
        (keys ``'dim'``, ``'thresh'``, ``'ineq'``) with an added ``'alpha'``
        entry holding that classifier's vote weight.
    """
    weakClassArr = []  # accumulated weak classifiers
    m = np.shape(dataArr)[0]
    # Built once; np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    labelMat = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m, 1)) / m)  # start from uniform sample weights
    aggClassEst = np.asmatrix(np.zeros((m, 1)))  # running weighted vote per sample
    for _ in range(numIt):
        # Best stump under the current weights, its weighted error and predictions.
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back the error as a 1x1 matrix; pull the scalar out
        # explicitly because float() on a non-0-d array is deprecated.
        error = float(np.asarray(error).ravel()[0])
        # The max() guard avoids a division by zero when the stump is perfect.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Exponent is -alpha where the stump was right and +alpha where wrong.
        expon = np.multiply(-1 * alpha * labelMat, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()  # renormalise so the weights sum to 1
        aggClassEst += alpha * classEst
        # sign() turns the running vote into +1 / -1 predictions;
        # aggErrors holds 1 for misclassified samples and 0 for correct ones.
        aggErrors = np.multiply(np.sign(aggClassEst) != labelMat, np.ones((m, 1)))
        errorRate = aggErrors.sum() / m
        print("total error: ", errorRate, "\n")
        if errorRate == 0.0:
            break
    print('weakClassArr', weakClassArr)
    return weakClassArr

# Train on the toy data set with at most 9 boosting rounds (third argument is numIt).
classifierArray = adaBoostTrainDS(datMat, classLabels, 9) 

split : dim 0, thresh 0.90, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 0.90, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.00, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 1.00, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.10, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 1.10, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.20, thresh ineqal: lessthan, the weight error is 0.400
split : dim 0, thresh 1.20, thresh ineqal: morethan, the weight error is 0.600
split : dim 0, thresh 1.30, thresh ineqal: lessthan, the weight error is 0.200
split : dim 0, thresh 1.30, thresh ineqal: morethan, the weight error is 0.800
split : dim 0, thresh 1.40, thresh ineqal: lessthan, the weight error is 0.200
split : dim 0, thresh 1.40, thresh ineqal: morethan, the weight error is 0.800
split : dim 0, thresh 1.50, thresh ineqal: lessthan,

In [15]:
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained ensemble of weighted decision stumps.

    Args:
        datToClass: data to classify, one sample per row (a single flat
            sample such as ``[0, 0]`` is treated as one row).
        classifierArr: list of stump dicts with keys ``'dim'``, ``'thresh'``,
            ``'ineq'`` and ``'alpha'``, as returned by ``adaBoostTrainDS``.

    Returns:
        An ``(m, 1)`` matrix holding the sign of the alpha-weighted vote for
        each of the m samples.
    """
    # Matrix form keeps the column slicing in stumpClassify 2-D;
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    dataMatrix = np.asmatrix(datToClass)
    m = np.shape(dataMatrix)[0]  # number of samples to classify
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        # Each stump's vote counts in proportion to its alpha.
        aggClassEst += stump['alpha'] * classEst
    return np.sign(aggClassEst)  # final class = sign of the weighted vote

# Train on the toy data set (at most 30 boosting rounds), then classify the point [0, 0].
datArr, labelArr = loadSimpleData()
classifierArr = adaBoostTrainDS(datArr, labelArr, 30)
adaClassify([0, 0], classifierArr)

total error:  0.2 

total error:  0.2 

total error:  0.0 

weakClassArr [{'dim': 0, 'thresh': 1.3, 'ineq': 'lessthan', 'alpha': 0.6931471805599453}, {'dim': 1, 'thresh': 1.0, 'ineq': 'lessthan', 'alpha': 0.9729550745276565}, {'dim': 0, 'thresh': 0.9, 'ineq': 'lessthan', 'alpha': 0.8958797346140273}]
aaa [{'dim': 0, 'thresh': 1.3, 'ineq': 'lessthan', 'alpha': 0.6931471805599453}, {'dim': 1, 'thresh': 1.0, 'ineq': 'lessthan', 'alpha': 0.9729550745276565}, {'dim': 0, 'thresh': 0.9, 'ineq': 'lessthan', 'alpha': 0.8958797346140273}]
[0, 0]
classEst [[-1.]]
aggClassEst [[-0.69314718]]
classEst [[-1.]]
aggClassEst [[-1.66610226]]
classEst [[-1.]]
aggClassEst [[-2.56198199]]


matrix([[-1.]])

In [5]:
def loadDataSet(fileName):
    """Load a tab-separated data file whose last column is the class label.

    The number of columns is taken from the first non-blank line; on every
    row all but the last of those columns are parsed as float features and
    the final field is parsed as the float label. Blank lines are skipped.

    Args:
        fileName: path of the tab-separated text file.

    Returns:
        tuple: ``(dataMat, labelMat)`` where ``dataMat`` is a list of
        per-row feature lists and ``labelMat`` is the list of labels.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # Single pass under ``with`` so the handle is always closed (the file used
    # to be opened twice and never closed).
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            if curLine == ['']:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            if numFeat is None:
                numFeat = len(line.split('\t'))
            dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# Directory holding the horse-colic data files (adjust to the local checkout).
DATA_DIR = 'F://MachineLearingInActionSourceCode//Ch07'
# Load the training set
datArr, labelArr = loadDataSet(DATA_DIR + '//horseColicTraining2.txt')
classifierArray = adaBoostTrainDS(datArr, labelArr, 10)  # train the ensemble (at most 10 rounds)
# Load the test set -- this used to re-read the training file, so the
# predictions below were made on the data the ensemble was trained on.
testArr, testLabelArr = loadDataSet(DATA_DIR + '//horseColicTest2.txt')
prediction10 = adaClassify(testArr, classifierArray)  # apply the ensemble to the test set

split : dim 0, thresh 0.90, thresh ineqal: lessthan, the weight error is 0.405
split : dim 0, thresh 0.90, thresh ineqal: morethan, the weight error is 0.595
split : dim 0, thresh 1.00, thresh ineqal: lessthan, the weight error is 0.438
split : dim 0, thresh 1.00, thresh ineqal: morethan, the weight error is 0.562
split : dim 0, thresh 1.10, thresh ineqal: lessthan, the weight error is 0.438
split : dim 0, thresh 1.10, thresh ineqal: morethan, the weight error is 0.562
split : dim 0, thresh 1.20, thresh ineqal: lessthan, the weight error is 0.438
split : dim 0, thresh 1.20, thresh ineqal: morethan, the weight error is 0.562
split : dim 0, thresh 1.30, thresh ineqal: lessthan, the weight error is 0.438
split : dim 0, thresh 1.30, thresh ineqal: morethan, the weight error is 0.562
split : dim 0, thresh 1.40, thresh ineqal: lessthan, the weight error is 0.438
split : dim 0, thresh 1.40, thresh ineqal: morethan, the weight error is 0.562
split : dim 0, thresh 1.50, thresh ineqal: lessthan,

split : dim 9, thresh 3.00, thresh ineqal: lessthan, the weight error is 0.455
split : dim 9, thresh 3.00, thresh ineqal: morethan, the weight error is 0.545
split : dim 9, thresh 3.50, thresh ineqal: lessthan, the weight error is 0.455
split : dim 9, thresh 3.50, thresh ineqal: morethan, the weight error is 0.545
split : dim 9, thresh 4.00, thresh ineqal: lessthan, the weight error is 0.424
split : dim 9, thresh 4.00, thresh ineqal: morethan, the weight error is 0.576
split : dim 9, thresh 4.50, thresh ineqal: lessthan, the weight error is 0.424
split : dim 9, thresh 4.50, thresh ineqal: morethan, the weight error is 0.576
split : dim 9, thresh 5.00, thresh ineqal: lessthan, the weight error is 0.407
split : dim 9, thresh 5.00, thresh ineqal: morethan, the weight error is 0.593
split : dim 10, thresh -0.40, thresh ineqal: lessthan, the weight error is 0.593
split : dim 10, thresh -0.40, thresh ineqal: morethan, the weight error is 0.407
split : dim 10, thresh 0.00, thresh ineqal: less

split : dim 5, thresh 3.20, thresh ineqal: morethan, the weight error is 0.471
split : dim 5, thresh 3.60, thresh ineqal: lessthan, the weight error is 0.529
split : dim 5, thresh 3.60, thresh ineqal: morethan, the weight error is 0.471
split : dim 5, thresh 4.00, thresh ineqal: lessthan, the weight error is 0.526
split : dim 5, thresh 4.00, thresh ineqal: morethan, the weight error is 0.474
split : dim 6, thresh -0.40, thresh ineqal: lessthan, the weight error is 0.474
split : dim 6, thresh -0.40, thresh ineqal: morethan, the weight error is 0.526
split : dim 6, thresh 0.00, thresh ineqal: lessthan, the weight error is 0.431
split : dim 6, thresh 0.00, thresh ineqal: morethan, the weight error is 0.569
split : dim 6, thresh 0.40, thresh ineqal: lessthan, the weight error is 0.431
split : dim 6, thresh 0.40, thresh ineqal: morethan, the weight error is 0.569
split : dim 6, thresh 0.80, thresh ineqal: lessthan, the weight error is 0.431
split : dim 6, thresh 0.80, thresh ineqal: moretha

split : dim 8, thresh 2.10, thresh ineqal: lessthan, the weight error is 0.466
split : dim 8, thresh 2.10, thresh ineqal: morethan, the weight error is 0.534
split : dim 8, thresh 2.40, thresh ineqal: lessthan, the weight error is 0.466
split : dim 8, thresh 2.40, thresh ineqal: morethan, the weight error is 0.534
split : dim 8, thresh 2.70, thresh ineqal: lessthan, the weight error is 0.466
split : dim 8, thresh 2.70, thresh ineqal: morethan, the weight error is 0.534
split : dim 8, thresh 3.00, thresh ineqal: lessthan, the weight error is 0.462
split : dim 8, thresh 3.00, thresh ineqal: morethan, the weight error is 0.538
split : dim 9, thresh -0.50, thresh ineqal: lessthan, the weight error is 0.538
split : dim 9, thresh -0.50, thresh ineqal: morethan, the weight error is 0.462
split : dim 9, thresh 0.00, thresh ineqal: lessthan, the weight error is 0.499
split : dim 9, thresh 0.00, thresh ineqal: morethan, the weight error is 0.501
split : dim 9, thresh 0.50, thresh ineqal: lesstha

split : dim 8, thresh -0.30, thresh ineqal: morethan, the weight error is 0.471
split : dim 8, thresh 0.00, thresh ineqal: lessthan, the weight error is 0.502
split : dim 8, thresh 0.00, thresh ineqal: morethan, the weight error is 0.498
split : dim 8, thresh 0.30, thresh ineqal: lessthan, the weight error is 0.502
split : dim 8, thresh 0.30, thresh ineqal: morethan, the weight error is 0.498
split : dim 8, thresh 0.60, thresh ineqal: lessthan, the weight error is 0.502
split : dim 8, thresh 0.60, thresh ineqal: morethan, the weight error is 0.498
split : dim 8, thresh 0.90, thresh ineqal: lessthan, the weight error is 0.502
split : dim 8, thresh 0.90, thresh ineqal: morethan, the weight error is 0.498
split : dim 8, thresh 1.20, thresh ineqal: lessthan, the weight error is 0.515
split : dim 8, thresh 1.20, thresh ineqal: morethan, the weight error is 0.485
split : dim 8, thresh 1.50, thresh ineqal: lessthan, the weight error is 0.515
split : dim 8, thresh 1.50, thresh ineqal: morethan

split : dim 7, thresh 2.40, thresh ineqal: morethan, the weight error is 0.483
split : dim 7, thresh 3.00, thresh ineqal: lessthan, the weight error is 0.506
split : dim 7, thresh 3.00, thresh ineqal: morethan, the weight error is 0.494
split : dim 7, thresh 3.60, thresh ineqal: lessthan, the weight error is 0.506
split : dim 7, thresh 3.60, thresh ineqal: morethan, the weight error is 0.494
split : dim 7, thresh 4.20, thresh ineqal: lessthan, the weight error is 0.494
split : dim 7, thresh 4.20, thresh ineqal: morethan, the weight error is 0.506
split : dim 7, thresh 4.80, thresh ineqal: lessthan, the weight error is 0.494
split : dim 7, thresh 4.80, thresh ineqal: morethan, the weight error is 0.506
split : dim 7, thresh 5.40, thresh ineqal: lessthan, the weight error is 0.443
split : dim 7, thresh 5.40, thresh ineqal: morethan, the weight error is 0.557
split : dim 7, thresh 6.00, thresh ineqal: lessthan, the weight error is 0.452
split : dim 7, thresh 6.00, thresh ineqal: morethan,

NameError: name 'adaClassify' is not defined