# 使用AdaBoost 元算法进行病马死亡率的预测
使用 horseColicTraining2.txt 文件作为训练集，
horseColicTest2.txt 文件作为测试集，使用基于单层决策树的
AdaBoost 算法（弱分类器数目为 40）进行病马死亡率的预测
（不使用 sklearn 库）。

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from jedi.api.helpers import infer
from matplotlib.pyplot import figure
from mpmath import matrix
from sklearn.preprocessing import StandardScaler


def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):
    """Label every sample with a single-feature threshold test (decision stump).

    Args:
        dataMatrix: 2-D sample data, one row per sample, indexable as
            ``dataMatrix[:, dimen]``.
        dimen: Index of the feature column that is compared.
        threshVal: Threshold the feature value is compared against.
        threshIneq: ``'lt'`` assigns -1.0 to samples whose value is
            ``<= threshVal``; any other value assigns -1.0 to samples whose
            value is ``> threshVal``.

    Returns:
        An ``(m, 1)`` float ndarray containing +1.0 or -1.0 for each of the
        m rows of ``dataMatrix``.
    """
    feature = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        negativeMask = feature <= threshVal
    else:
        negativeMask = feature > threshVal

    # Everything starts as +1; only the samples selected by the mask flip.
    predictions = np.ones((np.shape(dataMatrix)[0], 1))
    predictions[negativeMask] = -1.0
    return predictions

def buildStump(dataArr,classLabels,D,numSteps=10.0):
    """Find the decision stump with the lowest weighted classification error.

    Every feature column is scanned with ``int(numSteps) + 2`` evenly spaced
    thresholds (starting one step below the column minimum) and both
    inequality directions; the combination with the smallest weighted error
    is kept.

    Args:
        dataArr: 2-D sample data, one row per sample.
        classLabels: Sequence of m labels; compared for equality with the
            +1.0 / -1.0 predictions of ``stumpClassify``.
        D: Column vector of m sample weights (``D.T * errors`` must yield
            the weighted error).
        numSteps: Number of threshold steps across each feature's range.
            Defaults to 10.0, the value that used to be hard-coded.

    Returns:
        A tuple ``(bestStump, minError, bestClasEst)``:
        ``bestStump`` is a dict with keys ``'dim'``, ``'thresh'`` and
        ``'ineq'``; ``minError`` is the weighted error of that stump, kept
        in the form produced by ``D.T * errArr`` (a 1x1 matrix when ``D`` is
        a column matrix); ``bestClasEst`` is the ``(m, 1)`` prediction
        vector of that stump.
    """
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dataArr)
    labelMat = np.asmatrix(classLabels).T
    m, n = np.shape(dataMatrix)
    bestStump = {}
    bestClasEst = np.asmatrix(np.zeros((m, 1)))
    minError = np.inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j starts at -1 so one candidate threshold lies below the minimum.
        for j in range(-1, int(numSteps) + 1):
            # The threshold does not depend on the inequality direction.
            threshVal = rangeMin + float(j) * stepSize
            for inequal in ('lt', 'gt'):
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                # 1 where the stump is wrong, 0 where it is right.
                errArr = np.asmatrix(np.ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                weightedError = D.T * errArr
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst

def adaBoostTrainDS(dataArr,classLabels,numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Each round fits the best stump for the current sample weights with
    ``buildStump``, computes its vote weight ``alpha``, re-weights the
    samples, and prints the training error of the ensemble so far. Training
    stops early once that error reaches 0.

    Args:
        dataArr: 2-D training data, one row per sample.
        classLabels: Sequence of m labels (+1 / -1).
        numIt: Maximum number of boosting rounds (weak classifiers).

    Returns:
        A list of stump dicts, each with keys ``'dim'``, ``'thresh'``,
        ``'ineq'`` and ``'alpha'``, in the order they were trained.
    """
    weakClassArr = []
    m = np.shape(dataArr)[0]
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    # The label column is loop-invariant, so build it once.
    labelMat = np.asmatrix(classLabels).T
    D = np.asmatrix(np.ones((m, 1)) / m)  # start from uniform weights
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for _ in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # buildStump hands back the error as a 1x1 matrix; unwrap it so the
        # arithmetic below is plain scalar math (also accepts a bare scalar).
        error = float(np.asarray(error).item())
        # max(..., 1e-16) avoids division by zero for a perfect stump.
        alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)

        # Misclassified samples gain weight, correct ones lose weight.
        expon = np.multiply(-1 * alpha * labelMat, classEst)
        D = np.multiply(D, np.exp(expon))
        D = D / D.sum()

        aggClassEst += alpha * classEst
        errorRate = (np.sign(aggClassEst) != labelMat).sum() / m
        print('total error:', errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr

def adaClassify(dattoClass,classifierArr):
    """Classify samples with a trained list of weighted decision stumps.

    Args:
        dattoClass: Sample data; a single sample (flat sequence) or a 2-D
            collection with one row per sample.
        classifierArr: List of stump dicts with keys ``'dim'``, ``'thresh'``,
            ``'ineq'`` and ``'alpha'``.

    Returns:
        An ``(m, 1)`` matrix holding the sign of the alpha-weighted sum of
        the stump predictions for each of the m samples.
    """
    # np.asmatrix is used instead of the np.mat alias, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dattoClass)
    m = np.shape(dataMatrix)[0]
    aggClassEst = np.asmatrix(np.zeros((m, 1)))
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return np.sign(aggClassEst)
        
def plotROC(presStrengths,classLabels):
    """Plot the ROC curve for a set of prediction scores and print the AUC.

    The curve is traced from (1, 1) towards (0, 0) while walking through the
    samples in ascending score order: a sample labelled 1.0 moves the cursor
    down by one true-positive step, any other sample moves it left by one
    false-positive step.

    Args:
        presStrengths: Prediction scores, one per sample. A 1xN matrix, a
            column vector or a flat sequence are all accepted (the values
            are flattened before sorting).
        classLabels: True labels, one per sample; 1.0 marks the positive
            class.

    Raises:
        ZeroDivisionError: If ``classLabels`` contains no positive or no
            negative sample (the step sizes are undefined).
    """
    scores = np.asarray(presStrengths).ravel()
    labels = np.asarray(classLabels).ravel()

    numPosClas = int(np.count_nonzero(labels == 1.0))
    yStep = 1 / float(numPosClas)
    xStep = 1 / float(len(labels) - numPosClas)

    cur = (1.0, 1.0)
    ySum = 0.0  # sum of curve heights at every x step, used for the AUC
    xs = [cur[0]]
    ys = [cur[1]]
    for index in scores.argsort():
        if labels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            ySum += cur[1]
        cur = (cur[0] - delX, cur[1] - delY)
        xs.append(cur[0])
        ys.append(cur[1])

    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    ax.plot(xs, ys, c='b')
    ax.plot([0, 1], [0, 1], 'b--')  # chance-level diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    # axis() needs [xmin, xmax, ymin, ymax]; the former two-element list
    # made the call fail before the plot was shown.
    ax.axis([0, 1, 0, 1])
    plt.show()
    print('AUC:', ySum * xStep)
    

# datMat=np.matrix([[1.,2.1],[2.,1.1],[1.3,1.],[1.,1.],[2.,1.]])
# classLabels=[1.0,1.0,-1.0,-1.0,1.0]
# 
# D=np.mat(np.ones((5,1))/5)
# buildStump(datMat,classLabels,D)
# 
# classifierArr=adaBoostTrainDS(datMat,classLabels,9)
# 
# classifierArr=adaClassify([0,0],classifierArr)

# Load the tab-separated training and test sets; the last column is the label.
dataTrain=pd.read_csv('C:/Users/Admin/Desktop/WHU study/programming/python/MachineLearning/exp2/horseColicTraining.txt', sep='\t', header=None)
dataTest=pd.read_csv('C:/Users/Admin/Desktop/WHU study/programming/python/MachineLearning/exp2/horseColicTest.txt', sep='\t', header=None)
# Force float storage so the in-place feature scaling below is never truncated.
dataTrain=np.array(dataTrain,dtype=float)
dataTest=np.array(dataTest,dtype=float)

# Map the labels onto {+1, -1}: 1 stays 1, every other value becomes -1.
dataTrain[:,-1]=np.where(dataTrain[:,-1]==1,1,-1)
dataTest[:,-1]=np.where(dataTest[:,-1]==1,1,-1)

# Standardise the features with statistics taken from the training set only.
# NOTE(review): the task statement asks for a solution without sklearn;
# StandardScaler is used here for feature scaling only - confirm this is acceptable.
ss=StandardScaler()
dataTrain[:,:-1]=ss.fit_transform(dataTrain[:,:-1])
dataTest[:,:-1]=ss.transform(dataTest[:,:-1])

classifierArray=adaBoostTrainDS(dataTrain[:,:-1],dataTrain[:,-1],40)
prediction=adaClassify(dataTest[:,:-1],classifierArray)

# Count mismatches directly instead of masking a ones-matrix whose size was
# hard-coded to 67 rows, so any test-set size works.
testLabels=dataTest[:,-1]
mismatches=np.asarray(prediction).ravel()!=testLabels
print("test error rate:",mismatches.sum()/len(testLabels))

total error: 0.2842809364548495
total error: 0.2842809364548495
total error: 0.24749163879598662
total error: 0.24749163879598662
total error: 0.25418060200668896
total error: 0.2408026755852843
total error: 0.2408026755852843
total error: 0.22073578595317725
total error: 0.24749163879598662
total error: 0.23076923076923078
total error: 0.2408026755852843
total error: 0.2140468227424749
total error: 0.22742474916387959
total error: 0.21739130434782608
total error: 0.22073578595317725
total error: 0.21739130434782608
total error: 0.22073578595317725
total error: 0.22408026755852842
total error: 0.23076923076923078
total error: 0.2140468227424749
total error: 0.22408026755852842
total error: 0.21070234113712374
total error: 0.20735785953177258
total error: 0.20066889632107024
total error: 0.20735785953177258
total error: 0.2140468227424749
total error: 0.20735785953177258
total error: 0.2040133779264214
total error: 0.21070234113712374
total error: 0.2040133779264214
total error: 0.20735