Train a model using the Bagging (bootstrap aggregating) algorithm:
1. generate n bootstrap samples from the original dataset
2. for each sample, train a classifier by selecting the attribute with the most information gain
3. get the predictions of each classifier on the unseen data
4. use a majority vote over all predictions to classify each unseen datapoint

File Paths:  
- training data: data/heart_train.data
- test data: data/heart_test.data

In [3]:
import numpy as np
import random

#read file and store data to X and Y 
def read_data(filename):
    """Load a comma-separated dataset into feature and label arrays.

    Each non-blank line is split on commas: field 0 is parsed as the integer
    label and fields 1..22 (fewer if the line is shorter) as float features.
    After loading, every 0 in both the features and the labels is replaced
    with -1.

    Args:
        filename: path of the file to read (decoded as utf-8-sig, so a
            leading BOM is tolerated).

    Returns:
        (X, Y): X is the array of feature rows, Y is the array of labels.

    Raises:
        ValueError: if a non-blank line holds a field that cannot be parsed
            as a number.
    """
    dataset = []
    #the context manager closes the file even when parsing raises
    with open(filename, 'r', encoding='utf-8-sig') as file:
        for line in file:
            #skip blank lines (e.g. trailing newlines) that int() cannot parse
            if not line.strip():
                continue
            data = line.split(',')
            y_data = int(data[0])
            x_data = [float(x) for x in data[1:23]]
            dataset.append((x_data, y_data))

    X = np.array([x for x, _ in dataset])
    Y = np.array([y for _, y in dataset])
    #replace 0 with -1
    X = np.where(X == 0, -1, X)
    Y = np.where(Y == 0, -1, Y)

    return X, Y

#create a new sample of dataset randomly (with replacement)
def compute_bootstrap_sample(X, Y):
    """Draw a bootstrap sample the same size as the input dataset.

    Row indices are drawn uniformly at random with replacement using
    random.randrange, one draw per row of X; the same indices select the
    rows of X and the entries of Y so pairs stay aligned.

    Returns:
        (sample_X, sample_Y) as numpy arrays.
    """
    num_data = X.shape[0]
    #one independent draw per row, so duplicates are expected
    picks = [random.randrange(num_data) for _ in range(num_data)]

    sample_X = np.array([X[k] for k in picks])
    sample_Y = np.array([Y[k] for k in picks])
    return sample_X, sample_Y

#compute entropy
def calculate_entropy(label):
    """Return the base-2 entropy of the values in label.

    The distinct values are counted with np.unique and the result is the sum
    of -p * log2(p) over their relative frequencies p. An empty label yields
    0.
    """
    _, class_count = np.unique(label, return_counts=True)
    total_count = np.sum(class_count)

    #accumulate term by term, starting from 0, in the order np.unique reports
    return sum(
        (-(count / total_count)) * np.log2(count / total_count)
        for count in class_count
    )

#compute conditional entropy
def calculate_conditional_entropy(attribute_set, label):
    """Return the entropy of label conditioned on attribute_set.

    For every distinct attribute value, the labels of the rows holding that
    value are selected and their entropy (via calculate_entropy) is weighted
    by the fraction of rows with that value; the weighted entropies are
    summed.
    """
    values, value_count = np.unique(attribute_set, return_counts=True)
    total_count = np.sum(value_count)

    weighted_entropy = 0
    for value, count in zip(values, value_count):
        #labels of the rows whose attribute equals this value
        matching_labels = label[attribute_set == value]
        weighted_entropy += (count / total_count) * calculate_entropy(matching_labels)

    return weighted_entropy

#select the best classifier (most info gain)
def select_classifier(X, Y):
    """Return the column index of X with the largest information gain on Y.

    Information gain is the entropy of Y minus the entropy of Y conditioned
    on the column. The comparison is ">=" against a running best that starts
    at 0, so among equally good columns the last one wins, and index 0 is
    returned when no column reaches a gain of at least 0.
    """
    base_entropy = calculate_entropy(Y)

    best_attribute_index = 0
    best_gain = 0
    for column in range(X.shape[1]):
        gain = base_entropy - calculate_conditional_entropy(X[:, column], Y)
        #">=" (not ">") keeps the later column on ties
        if gain >= best_gain:
            best_gain, best_attribute_index = gain, column

    return best_attribute_index

#prediction of classifier 
def classifier_prediction(X, Y, attribute_index):
    """Predict a label for every row of X from a single attribute.

    A row is predicted -1 when its value in column attribute_index equals
    -1, and 1 otherwise.

    Args:
        X: 2-D feature array; one prediction is produced per row.
        Y: unused. Kept only so existing callers keep working; the output is
            sized from X, so labels are not required and None may be passed
            when predicting on unlabeled data.
        attribute_index: column of X that acts as the classifier.

    Returns:
        float array of 1.0 / -1.0 with one entry per row of X.
    """
    attribute_set = X[:, attribute_index]
    #size the output from X itself so prediction never depends on the labels
    return np.where(attribute_set == -1, -1.0, 1.0)

#final prediction
def final_prediction(predictions, X):
    """Combine per-classifier predictions by majority vote.

    predictions is a sequence of equally indexed prediction sequences, one
    per classifier. For each datapoint, votes equal to 1 count as positive
    and every other value counts as negative; the result is 1 when the
    positives are at least as many as the negatives (ties go to 1), else -1.
    X is not used.

    Returns:
        list of 1 / -1, one entry per datapoint.
    """
    pred_Y = []

    #zip(*predictions) regroups the votes datapoint by datapoint
    for votes in zip(*predictions):
        pos_count = sum(1 for vote in votes if vote == 1)
        neg_count = len(votes) - pos_count
        pred_Y.append(1 if pos_count >= neg_count else -1)

    return pred_Y
            
#compute accuracy 
def compute_accuracy(Y, pred_Y):
    """Return the percentage of entries of pred_Y that equal Y.

    Entries are compared pairwise in order and the number of matches is
    divided by Y.shape[0], then scaled to a 0-100 percentage.
    """
    matches = sum(1 for actual, predicted in zip(Y, pred_Y) if actual == predicted)
    return (matches / Y.shape[0]) * 100


#training split: used to build the bootstrap samples
train_X, train_Y = read_data('data/heart_train.data')
#test split: the unseen data the ensemble is evaluated on
X, Y = read_data('data/heart_test.data')

bootstrap_sample_X, bootstrap_sample_Y = [], []
classifiers, predictions = [], []

#bagging algorithm: 20 rounds, one single-attribute classifier per round
for i in range(20):
    #draw a fresh bootstrap sample and keep it for later inspection
    sample_X, sample_Y = compute_bootstrap_sample(train_X, train_Y)
    bootstrap_sample_X.append(sample_X)
    bootstrap_sample_Y.append(sample_Y)

    #the classifier is the index of the attribute with most information gain
    classifier = select_classifier(sample_X, sample_Y)
    classifiers.append(classifier)

    #record this classifier's predictions on the unseen data
    prediction = classifier_prediction(X, Y, classifier)
    predictions.append(prediction)

#majority vote across all classifiers
final_pred_Y = final_prediction(predictions, X)

accuracy = compute_accuracy(Y, final_pred_Y)
print('accuracy:', accuracy, '%')

accuracy: 60.42780748663101 %
