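Train a model using the AdaBoost algorithm:
1. Initialize every sample weight to 1/M (M = number of training samples)
2. Select the weak classifier (decision stump) with the least weighted error
3. Compute alpha (the selected classifier's vote weight) from that error
4. Update the sample weights
5. Repeat steps 2-4 for a fixed number of rounds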

File Paths:  
- training data: data/heart_train.data
- test data: data/heart_test.data

In [1]:
import numpy as np

#Decision Stump class (decision tree with single attribute split)
class DecisionStump:
    """Weak learner that labels samples by thresholding a single attribute.

    The constructor leaves every field as None; they are expected to be
    assigned by whoever fits the stump before predict() is called.
    """

    def __init__(self):
        # column of X that the stump inspects
        self.attribute_index = None
        # -1: values below the threshold get label -1, all others +1
        # anything else: values below the threshold get label +1, all others -1
        self.polarity = None
        # cut-off compared against the attribute with a strict "<"
        self.threshold = None
        # weight of this stump's vote in the boosted ensemble
        self.alpha = None

    def predict(self, X):
        """Return a float array holding one -1/+1 label per row of X.

        X must support 2-D NumPy indexing (X[:, attribute_index]).
        """
        column = X[:, self.attribute_index]
        is_below = column < self.threshold

        # polarity decides which side of the threshold receives which label
        if self.polarity == -1:
            label_below, label_other = -1.0, 1.0
        else:
            label_below, label_other = 1.0, -1.0

        return np.where(is_below, label_below, label_other)


#AdaBoost algorithm class
class AdaBoost:
    """AdaBoost ensemble of decision stumps for labels coded as -1 / +1.

    Each round picks the (attribute, polarity, threshold) stump with the
    smallest weighted training error, gives it a vote weight alpha, and
    re-weights the samples so that misclassified ones count more next round.
    """

    # Clamp for the weighted error before taking the log: a perfect stump
    # (error 0) would otherwise divide by zero and poison the weights with NaN.
    _EPSILON = 1e-10

    def __init__(self, rounds=10):
        # number of boosting rounds, i.e. stumps fitted by train()
        self.rounds = rounds
        # fitted stumps, in the order they were selected
        self.all_classifiers = []

    #training of model using AdaBoost algorithm
    def train(self, X, Y):
        """Fit `rounds` decision stumps on X (2-D features) and Y (1-D labels).

        Any previously fitted stumps are discarded.

        Raises:
            ValueError: if X has no rows or no columns, or if Y is not a 1-D
                array with one label per row of X.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        num_data, num_attributes = X.shape
        if num_data == 0 or num_attributes == 0:
            raise ValueError('X must contain at least one sample and one attribute')
        if Y.shape != (num_data,):
            raise ValueError('Y must be 1-D with one label per row of X')

        # start from a clean ensemble so repeated train() calls do not leave
        # stale stumps in front of the new ones
        self.all_classifiers = []

        #initialize data weights as 1/M
        data_weights = np.full(num_data, 1 / num_data)

        # candidate splits; loop-invariant, so built once
        # (presumably chosen because attributes are expected to be coded -1/+1)
        thresholds = (-1, 1)
        polarities = (-1, 1)

        #compute weak_classifiers for t rounds
        for _ in range(self.rounds):
            classifier = DecisionStump()
            min_error = float('inf')
            best_pred = None

            #iterate through each attribute to select the classifier with least error
            for i in range(num_attributes):
                attribute_set = X[:, i]

                #try every possible classification for each attribute
                for p in polarities:
                    for t in thresholds:
                        pred_Y = self.predict(attribute_set, t, p)

                        # weighted error = total weight of misclassified samples
                        error = data_weights[pred_Y != Y].sum()

                        # strict "<" keeps the first candidate on ties
                        if error < min_error:
                            min_error = error
                            best_pred = pred_Y
                            classifier.attribute_index = i
                            classifier.polarity = p
                            classifier.threshold = t

            #compute alpha (error clamped away from 0 and 1 to keep it finite)
            clipped_error = np.clip(min_error, self._EPSILON, 1 - self._EPSILON)
            classifier.alpha = 0.5 * np.log((1 - clipped_error) / clipped_error)

            #update data weights: shrink correct samples, grow misclassified ones
            data_weights = data_weights * np.exp(-classifier.alpha * Y * best_pred)
            # Renormalise by the actual sum. Without clamping this equals the
            # analytic factor 2*sqrt(err*(1-err)), but it stays valid when the
            # error was clamped and prevents floating-point drift.
            data_weights /= data_weights.sum()

            #append classifier to list
            self.all_classifiers.append(classifier)

    #prediction of weak classifier
    def predict(self, attribute_set, t, p):
        """Label one attribute column with a threshold/polarity stump.

        With p == -1, values strictly below t get -1 and the rest +1;
        with any other p the labels are swapped. Returns a float array.
        """
        is_below = np.asarray(attribute_set) < t
        if p == -1:
            return np.where(is_below, -1.0, 1.0)
        return np.where(is_below, 1.0, -1.0)

    #final prediction
    def final_predict(self, X):
        """Return the sign of the alpha-weighted vote of all fitted stumps.

        Entries are -1.0 or +1.0, or 0.0 where the weighted votes cancel
        exactly (np.sign of zero).

        Raises:
            ValueError: if no classifier has been fitted yet.
        """
        if not self.all_classifiers:
            raise ValueError('no classifiers available; call train() first')

        # iterate over what was actually fitted rather than range(self.rounds),
        # so the ensemble still works if the two ever differ
        weighted_votes = 0
        for classifier in self.all_classifiers:
            weighted_votes = weighted_votes + classifier.alpha * classifier.predict(X)
        return np.sign(weighted_votes)


#read file and store data to X and Y 
def read_data(filename, num_features=22):
    """Load a comma-separated data file into feature and label arrays.

    Each non-blank line is expected to hold an integer label in the first
    field followed by numeric feature fields. Blank lines are skipped.

    Args:
        filename: path of the file to read (a UTF-8 BOM, if present, is
            stripped by the 'utf-8-sig' codec).
        num_features: maximum number of feature fields taken after the label;
            defaults to 22, the value that used to be hard-coded.

    Returns:
        (X, Y): X is a float array of features and Y an integer array of
        labels, both with every 0 replaced by -1.

    Raises:
        ValueError: if a label or feature field cannot be parsed as a number.
    """
    features = []
    labels = []

    # the context manager guarantees the file is closed, even on a parse error
    with open(filename, 'r', encoding='utf-8-sig') as file:
        for line in file:
            line = line.strip()
            if not line:
                # tolerate blank/trailing lines instead of crashing on int('')
                continue
            data = line.split(',')
            labels.append(int(data[0]))
            features.append([float(x) for x in data[1:1 + num_features]])

    X = np.array(features)
    Y = np.array(labels)
    #replace 0 with -1
    X = np.where(X == 0, -1, X)
    Y = np.where(Y == 0, -1, Y)

    return X, Y

#compute accuracy 
def compute_accuracy(Y, pred_Y):
    """Return the percentage (0-100) of entries of pred_Y that equal Y.

    Args:
        Y: 1-D sequence of true labels.
        pred_Y: 1-D sequence of predicted labels, same length as Y.

    Returns:
        Accuracy as a Python float in the range [0, 100].

    Raises:
        ValueError: if the inputs are empty, not 1-D, or differ in shape.
    """
    Y = np.asarray(Y)
    pred_Y = np.asarray(pred_Y)

    # a length mismatch used to be silently ignored (extra predictions) or to
    # surface as an IndexError (missing predictions); report it explicitly
    if Y.ndim != 1 or Y.shape != pred_Y.shape:
        raise ValueError('Y and pred_Y must be 1-D arrays of the same length')
    if Y.shape[0] == 0:
        raise ValueError('cannot compute accuracy of an empty label array')

    correct_predictions = int(np.count_nonzero(Y == pred_Y))
    # same arithmetic as a manual count so the reported value is unchanged
    return (correct_predictions / Y.shape[0]) * 100


#train dataset: features and labels parsed by read_data (which remaps every 0 to -1)
train_X, train_Y = read_data('data/heart_train.data')
#test dataset; note that from here on the generic names X and Y refer to the test split
X, Y = read_data('data/heart_test.data')

#get final classifier: boosted ensemble of decision stumps, 10 rounds, fitted on the training split only
final_classifier = AdaBoost(rounds=10)
final_classifier.train(train_X, train_Y)

#get final prediction: sign of the alpha-weighted sum of the stumps' votes on the test features
pred_Y = final_classifier.final_predict(X)

#compute accuracy: percentage of test labels that equal the predictions
accuracy = compute_accuracy(Y, pred_Y)
print('accuracy:', accuracy, '%')

accuracy: 69.5187165775401 %
