In [83]:

# This script reads training and test data points from text files and applies the AdaBoost method (with decision stumps as weak classifiers) to classify them.



from numpy import *
#import data
from copy import deepcopy

import numpy as py

# Read data from Training or Testing file
def func_readData(filename,option):
    """Load comma-separated samples from a text file into a NumPy matrix.

    File layout, one sample per line:
      * option == 'train':  ``label:v1,v2,...,vn``  (label is parsed with int())
      * option == 'test':   ``v1,v2,...,vn``

    Blank lines are skipped, and the last line does not need a trailing
    newline (surrounding whitespace is stripped before parsing).

    Parameters
    ----------
    filename : path of the file to read.
    option   : 'train' or 'test'.

    Returns
    -------
    'train' -> (data, label): data is a matrix with one row per sample and
               label is a list of ints, in file order.
    'test'  -> data only.
    data is None when the file holds no samples.
    For any other option a message is printed and None is returned.

    Raises
    ------
    ValueError if a label or a value cannot be parsed as a number.
    """
    if option not in ('train', 'test'):
        print ('Wrong input parameter!')
        return None

    with_labels = (option == 'train')
    labels = []
    rows = []
    # The context manager closes the file even when parsing raises.
    with open(filename,'r') as fid:
        for fline in fid:
            fline = fline.strip()
            if not fline:
                continue    # tolerate blank lines
            if with_labels:
                head, _, fline = fline.partition(':')
                labels.append(int(head))
            rows.append([float(token) for token in fline.split(',')])

    # Build the matrix once instead of re-stacking it for every line.
    # asmatrix is used because the py.mat alias no longer exists in NumPy 2.
    data = py.asmatrix(rows) if rows else None
    if with_labels:
        return data,labels
    return data


# function for building weak classifiers, i.e.:  stump function

def buildWeakStump(d,l,D): # (data, label, weight)
    """Search for the decision stump with the lowest weighted error.

    For every feature column, 12 candidate thresholds are tried (from one
    step below the column minimum up to the column maximum, in steps of
    one tenth of the column range), each with both inequality directions
    ('lt' and 'gt').

    Parameters
    ----------
    d : sample data, one row per sample (converted with py.asmatrix).
    l : sample labels, one per sample (converted to a column matrix).
    D : column matrix of sample weights, one row per sample.

    Returns
    -------
    bestStump : dict with keys 'dim', 'threshold', 'ineq' (empty if no
                candidate was accepted).
    minErr    : weighted error of that stump as a 1x1 matrix (py.inf if no
                candidate was accepted).
    bestClass : the stump's predictions for every sample, shape (m, 1).
    """
    dataMatrix = py.asmatrix(d)
    labelmatrix = py.asmatrix(l).T
    m,n = py.shape(dataMatrix)
    numstep = 10.0
    bestStump = {}
    # Fallback prediction has one row per sample (was hard-coded to 5 rows).
    bestClass = py.asmatrix(py.zeros((m,1)))
    minErr = py.inf
    for i in range(n):
        datamin = dataMatrix[:,i].min()
        datamax = dataMatrix[:,i].max()
        stepSize = (datamax - datamin) / numstep
        for j in range(-1,int(numstep)+1):
            # The threshold does not depend on the inequality direction.
            threshold = datamin + float(j) * stepSize
            for inequal in ['lt','gt']:
                predict = stumpClassify(dataMatrix,i,threshold,inequal)
                # 1.0 where the stump disagrees with the label, else 0.0
                err = py.asmatrix((predict != labelmatrix).astype(float))
                weighted_err = D.T * err
                if weighted_err < minErr:
                    minErr = weighted_err
                    bestClass = predict.copy()
                    bestStump['dim'] = i
                    bestStump['threshold'] = threshold
                    bestStump['ineq'] = inequal
    return bestStump, minErr, bestClass

# Use a weak classifier, i.e. a decision stump, to classify data

def stumpClassify(datamat,dim,threshold,inequal):
    """Label every row of datamat with a single-feature threshold test.

    Returns an (m, 1) array of +1.0 / -1.0, where m is the number of rows.
    With inequal == 'lt', rows whose value in column `dim` is <= threshold
    get -1.0; with any other value of inequal, rows whose value is
    > threshold get -1.0.  All remaining rows get +1.0.
    """
    column = datamat[:,dim]
    # Pick the rows that fall on the "negative" side of the threshold.
    negative = (column <= threshold) if inequal == 'lt' else (column > threshold)
    votes = py.ones((py.shape(datamat)[0],1))
    votes[negative] = -1.0
    return votes

# Boosting Algorithm

def train(data,label,numIt = 1000):
    """Train an AdaBoost ensemble of decision stumps.

    Parameters
    ----------
    data  : training samples, one row per sample.
    label : training labels, one per sample (the stumps predict +1/-1).
    numIt : maximum number of boosting rounds.

    Returns
    -------
    report_train    : list with the ensemble's training error rate after
                      each round.
    weakClassifiers : list of stump dicts as returned by buildWeakStump,
                      each extended with its weight under key 'alpha'.

    Training stops early once the training error rate reaches 0.
    """
    eps = 1e-16     # keeps the alpha formula finite when the error is 0
    report_train=[]
    weakClassifiers = []
    # m is the number of samples
    m = py.shape(data)[0]
    # labels as an (m, 1) column; built once, reused every round
    labelColumn = py.asmatrix(label).T
    # sample weights, 1/m at the beginning
    D = py.asmatrix(py.ones((m,1))/m)

    estStrong = py.asmatrix(py.zeros((m,1)))
    for _ in range(numIt):
        # bestStump: weak classifier; error: its weighted error
        bestStump, error, classEstimate = buildWeakStump(data,label,D)

        # buildWeakStump hands the error back as a 1x1 matrix; take the
        # scalar explicitly instead of relying on float() of an array.
        error = py.asarray(error).item()

        # classifier weight: alpha = 0.5 * ln((1 - error) / error)
        alpha = float(0.5*py.log((1.0 - error) / (error + eps)))

        # add one more field to bestStump, i.e. classifier weight
        bestStump['alpha'] = alpha
        # add bestStump to the list of weak classifiers
        weakClassifiers.append(bestStump)

        # re-weight samples: D_i <- D_i * exp(-alpha * y_i * h(x_i)),
        # then normalize so the weights sum to 1
        exponent = py.multiply(-alpha*labelColumn,classEstimate)
        D = py.multiply(D,py.exp(exponent))
        D = D/D.sum()

        # running weighted vote of all stumps selected so far
        estStrong += classEstimate*alpha

        # 1.0 for every sample the current ensemble misclassifies
        EnsembleErrors = py.multiply(py.sign(estStrong)!=labelColumn,
                                     py.ones((m,1)))
        errorRate = EnsembleErrors.sum()/m

        report_train.append(errorRate)
        if errorRate == 0.0:
            break
    return report_train, weakClassifiers

# Applying an adaboost classifier for a single data sample

def adaboostClassify(dataTest,classifier):
    """Classify samples with a trained list of weighted decision stumps.

    Parameters
    ----------
    dataTest   : one or more samples, one row per sample.
    classifier : list of stump dicts with keys 'dim', 'threshold', 'ineq'
                 and 'alpha' (as produced by train()).

    Returns
    -------
    (m, 1) matrix holding the sign of the alpha-weighted sum of the stump
    predictions for each of the m samples.
    """
    dataMatrix = py.asmatrix(dataTest)
    m = py.shape(dataMatrix)[0]
    estStrong = py.asmatrix(py.zeros((m,1)))
    for stump in classifier:
        # prediction of this weak classifier for every sample
        classEstimate = stumpClassify(dataMatrix,stump['dim'],
                                      stump['threshold'],stump['ineq'])
        # accumulate all predictions, weighted by the classifier's alpha
        estStrong += stump['alpha']*classEstimate
    return py.sign(estStrong)

# Applying an adaboost classifier for all testing samples
def test(dataSet,classifier):
    """Run adaboostClassify on each row of dataSet, one row at a time.

    Returns a list with one adaboostClassify result per row, in row order.
    """
    sampleCount = py.shape(dataSet)[0]
    return [adaboostClassify(dataSet[row,:],classifier)
            for row in range(sampleCount)]


#############. main ##################
# The data files "train.txt" and "test.txt" are randomly generated by the function randomData() and are used to test your developed codes.

# Load the labelled training set and the unlabelled test set.
trainData,label = func_readData('train.txt','train')
testData = func_readData('test.txt','test')

# Fit the boosted ensemble, allowing at most 150 rounds.
report,classifier = train(trainData,label,150)
print('done training\n')

# Predict the test samples; note that `label` is rebound to the predictions.
label=test(testData,classifier)
print('done testing\n')

# One output line per boosting round.  The i-th test prediction is printed
# next to the i-th round's training error for as long as predictions remain.
for i, roundError in enumerate(report):
    fields = ["current error:  ",roundError]
    if i < len(label):
        fields += ["\t\t test result:  ",label[i]]
    print(*fields)



done training

done testing

current error:   0.43 		 test result:   [[1.]]
current error:   0.43 		 test result:   [[-1.]]
current error:   0.395 		 test result:   [[1.]]
current error:   0.42 		 test result:   [[-1.]]
current error:   0.395 		 test result:   [[1.]]
current error:   0.4 		 test result:   [[1.]]
current error:   0.395 		 test result:   [[1.]]
current error:   0.41 		 test result:   [[1.]]
current error:   0.39 		 test result:   [[1.]]
current error:   0.39 		 test result:   [[1.]]
current error:   0.37 		 test result:   [[1.]]
current error:   0.375 		 test result:   [[-1.]]
current error:   0.37 		 test result:   [[-1.]]
current error:   0.375 		 test result:   [[1.]]
current error:   0.365 		 test result:   [[1.]]
current error:   0.365 		 test result:   [[1.]]
current error:   0.365 		 test result:   [[1.]]
current error:   0.37 		 test result:   [[-1.]]
current error:   0.355 		 test result:   [[-1.]]
current error:   0.365 		 test result:   [[1.]]
current error:  