## Problem 2: Medical Diagnostics

**2. Now, suppose that the hypothesis space consists of only height 1 decision trees for this data
set (only one attribute split).**



In [5]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import math
from tqdm.notebook import tqdm, trange

%matplotlib inline

In [6]:
# Load the train/test splits; the files have no header row, so columns are
# labelled by integer position (0, 1, 2, ...).
heart_train = pd.read_csv('heart_train.data', header=None)
heart_test = pd.read_csv('heart_test.data', header=None)

# Column 0 is the class label. Recode class 0 as -1 so that labels are in
# {-1, +1} and an ensemble prediction can be taken as the sign of a weighted sum.
heart_train.loc[heart_train[0] == 0, 0] = -1
heart_test.loc[heart_test[0] == 0, 0] = -1

# Split into labels (column 0) and features (all remaining columns).
y_train, X_train = heart_train.iloc[:, 0], heart_train.iloc[:, 1:]
y_test, X_test = heart_test.iloc[:, 0], heart_test.iloc[:, 1:]

# Module-level names shared by the cells below:
#   attributes - feature column labels
#   m, n       - number of training rows / number of features
#   classes    - distinct label values present in the training set
attributes = X_train.columns
m, n = X_train.shape
classes = y_train.unique()

In [7]:
def generate_1_attr_hypotheses(attributes, classes):
    """Enumerate every height-1 decision tree (stump) over ``attributes``.

    Each hypothesis is a nested dict ``{attribute: {0: label, 1: label}}``:
    the outer key is the attribute that is split on, and the inner dict maps
    the attribute value (0 or 1) to the class label predicted on that branch.

    The result is ordered by attribute first, then by the label on branch 0,
    then by the label on branch 1, so it contains
    ``len(attributes) * len(classes) ** 2`` hypotheses.
    """
    return [
        {attr: {0: label_if_0, 1: label_if_1}}
        for attr in attributes
        for label_if_0 in classes
        for label_if_1 in classes
    ]

# Hypothesis pool shared by the cells below: every decision stump over the
# feature columns.
H = generate_1_attr_hypotheses(attributes, classes)
# Sanity check: each attribute contributes one stump per (branch-0 label,
# branch-1 label) pair; the 2**2 hard-codes a two-class problem.
assert len(attributes) * (2**2) == len(H)


**(a) Use coordinate descent to minimize the exponential loss function for this hypothesis
space over the training set. You can use any initialization and iteration order that you
would like other than the one selected by adaBoost. What is the optimal value of α that
you arrived at? What is the corresponding value of the exponential loss on the training
set?**

In [32]:
def predict(X, h):
    """Classify one example with a decision tree of height 1 or 2.

    ``h`` is a nested dict ``{attribute: {value: outcome}}`` where ``outcome``
    is either a class label (leaf) or another dict of the same shape (a
    second-level split). ``X`` is anything indexable by attribute, e.g. a
    DataFrame row or a plain dict.

    Returns the label reached by following ``X``'s attribute values, or
    ``None`` when ``h`` is empty.
    """
    for root_attr, branches in h.items():
        outcome = branches[X[root_attr]]
        if isinstance(outcome, dict):
            # Second-level split: descend once more and return that leaf.
            for child_attr, child_branches in outcome.items():
                return child_branches[X[child_attr]]
        else:
            return outcome
            

def boosting_predict(a, H=None, x=None, h_x=None):
    """Predict with a weighted ensemble: ``sign(sum_t a[t] * h_t(x))``.

    Parameters
    ----------
    a : numpy.ndarray
        One weight per hypothesis.
    H : sequence of hypotheses, optional
        Used together with ``x`` to compute the per-hypothesis predictions
        when ``h_x`` is not supplied.
    x : pandas.DataFrame, optional
        Examples to classify, one per row.
    h_x : array-like, optional
        Precomputed predictions, one row per hypothesis and one column per
        example. When given, ``H`` and ``x`` are ignored.

    Returns
    -------
    numpy.ndarray
        Sign of the weighted vote per example (0 where the vote is exactly 0).
    """
    if h_x is None:
        per_hypothesis = [
            x.apply(lambda row: predict(row, h), axis=1) for h in H
        ]
        h_x = np.array(per_hypothesis)
    weighted_vote = a.dot(h_x)
    return np.sign(weighted_vote)

def accuracy(y_truth, y_pred):
    """Return the percentage (0-100) of positions where the two label arrays agree."""
    hits = y_truth == y_pred
    return np.mean(hits) * 100

def coordinate_descent(y_train, ht_x, H, tol=0.01, max_iter=None):
    """Fit hypothesis weights by cyclic coordinate descent on the exponential loss.

    Minimises ``sum_i exp(-y_i * sum_t alpha_t * h_t(x_i))`` one coordinate at
    a time. With every other weight held fixed, the minimiser for ``alpha_t``
    has the closed form ``0.5 * log(W_correct / W_wrong)``, where the two sums
    run over the examples hypothesis ``t`` gets right / wrong and each example
    is weighted by the exponential loss of the remaining hypotheses.

    Parameters
    ----------
    y_train : array-like of shape (m,)
        Training labels. The closed-form update assumes labels and
        predictions take values in {-1, +1}. Converted positionally, so a
        pandas Series with any index is accepted.
    ht_x : array-like of shape (T, m)
        ``ht_x[t][i]`` is the prediction of hypothesis ``t`` on example ``i``.
    H : sequence
        The hypothesis pool; only its length ``T`` is used.
    tol : float, default 0.01
        Stop once the summed absolute change of all weights over one full
        sweep is no larger than this.
    max_iter : int or None, default None
        Optional cap on the number of sweeps; ``None`` means no cap.

    Returns
    -------
    numpy.ndarray of shape (T,)
        The fitted weights. All weights start at ``1 / m``.
    """
    labels = np.asarray(y_train, dtype=float)
    preds = np.asarray(ht_x, dtype=float)
    T = len(H)
    alphas = np.full(T, 1 / labels.size)

    # A hypothesis that is right (or wrong) on every weighted example makes
    # one of the two sums zero, which would give log(x / 0) or log(0). Clamp
    # both sums so the update stays finite instead of poisoning every later
    # update with inf/nan.
    floor = 1e-12

    alpha_change = np.inf
    iter_counter = 0
    while alpha_change > tol and (max_iter is None or iter_counter < max_iter):
        start_alphas = alphas.copy()
        for t_prime in range(T):
            # Ensemble score of every example with hypothesis t_prime left out.
            rest = alphas @ preds - alphas[t_prime] * preds[t_prime]
            weights = np.exp(-labels * rest)
            hit = preds[t_prime] == labels
            loss_n = weights[hit].sum()   # weight of correctly classified examples
            loss_d = weights[~hit].sum()  # weight of misclassified examples
            alphas[t_prime] = 0.5 * np.log(max(loss_n, floor) / max(loss_d, floor))
        alpha_change = np.abs(alphas - start_alphas).sum()
        if iter_counter % 100 == 0:
            print(f"Iteration {iter_counter} - Alpha change {alpha_change}")
        iter_counter += 1
    return alphas

In [33]:
# Cache every hypothesis's predictions on the training set once:
# after this cell, row t of ht_x holds H[t] applied to each training example.
ht_x = []
for h in H:
    y_pred = X_train.apply(lambda row: predict(row, h), axis=1)
    ht_x.append(y_pred)
ht_x = np.array(ht_x)

In [34]:
# Fit one weight per hypothesis in H from the cached training predictions.
alphas = coordinate_descent(y_train, ht_x, H)

In [14]:
print("Alpha values")
print(alphas)
# Exponential loss on the training set: sum_i exp(-y_i * f(x_i)), where
# f(x_i) = alphas . ht_x[:, i] is the weighted vote of all hypotheses.
loss = np.sum(np.exp(-y_train * alphas.dot(ht_x)))

print("***********************************************************")
print("Training Loss: ", loss)
print("***********************************************************")

Alpha values
[ 0.0125     -0.16432002  0.0125     -0.03718624  0.0125     -0.14933684
  0.0125     -0.10099259  0.0125     -0.18058637  0.0125     -0.08432352
  0.0125     -0.17827792  0.0125     -0.10761323  0.0125      0.04683466
  0.0125      0.02717568  0.0125     -0.0697316   0.0125     -0.05131991
  0.0125     -0.19045047  0.0125     -0.08232813  0.0125     -0.19724345
  0.0125     -0.08344875  0.0125      0.0079193   0.0125      0.00947466
  0.0125      0.03249193  0.0125      0.02045813  0.0125     -0.1273416
  0.0125     -0.06974654  0.0125     -0.05772459  0.0125     -0.03036459
  0.0125     -0.2920413   0.0125     -0.11183699  0.0125      0.09516313
  0.0125      0.06324829  0.0125     -0.02606898  0.0125     -0.02301836
  0.0125     -0.08694109  0.0125     -0.05881137  0.0125     -0.09471794
  0.0125     -0.08430297  0.0125     -0.03309757  0.0125     -0.03110784
  0.0125      0.02535473  0.0125      0.02055283  0.0125     -0.16226708
  0.0125     -0.10102695  0.0125      0


**(b) What is the accuracy of the resulting classifier on the test data?**


In [12]:
# Score the coordinate-descent ensemble on the held-out test set.
coor_pred = boosting_predict(alphas, H=H, x=X_test)
print("***********************************************************")
# Series.ravel() is deprecated since pandas 2.2; to_numpy() gives the same 1-D array.
print("Coordinate Test Accuracy", accuracy(y_test.to_numpy(), coor_pred.flatten()))
print("***********************************************************")

***********************************************************
Coordinate Test Accuracy 69.5187165775401
***********************************************************



**(c) What is the accuracy of adaBoost after 20 rounds for this hypothesis space on the test
data?**

In [38]:
def adaboost(H, X_train, y_train, rounds=None):
    """Run AdaBoost over a fixed pool of candidate hypotheses.

    On every round the hypothesis with the smallest weighted training error
    is selected, given the weight ``0.5 * ln((1 - e) / e)``, and the example
    weights are updated so that misclassified examples gain weight.

    Parameters
    ----------
    H : sequence of hypotheses
        Candidate pool; each element must be accepted by ``predict``.
    X_train : pandas.DataFrame
        Training features, one example per row.
    y_train : pandas.Series
        Training labels (expected in {-1, +1}), aligned with ``X_train``.
    rounds : int, optional
        Number of boosting rounds. When omitted, the module-level ``T`` is
        used, which is how earlier callers configured this function. Pass it
        explicitly to avoid depending on that global, which a later cell
        rebinds to a list.

    Returns
    -------
    tuple
        ``(alphas, epsilons, selected_H, y_predictions, best_idxs)``: the
        per-round hypothesis weights, weighted errors, selected hypotheses,
        their training predictions, and their 1-based positions in ``H``.

    Raises
    ------
    ValueError
        If ``H`` is empty.
    """
    if rounds is None:
        rounds = T
    if len(H) == 0:
        raise ValueError("H must contain at least one hypothesis")

    m, n = X_train.shape
    w = np.array([1/m] * m)
    alphas = [0] * rounds
    epsilons = [0] * rounds
    selected_H = [None] * rounds
    y_predictions = [None] * rounds
    best_idxs = []

    # A hypothesis's predictions do not depend on the example weights, so
    # evaluate the whole pool once instead of once per round.
    cached_preds = [
        X_train.apply(lambda row, h=h: predict(row, h), axis=1) for h in H
    ]

    # Smallest/largest weighted error used in the alpha formula; keeps the
    # logarithm and the normaliser finite when a hypothesis is perfect.
    eps = 1e-10

    print("Running Adaboost")
    for t in range(rounds):
        e_t = float("inf")
        h_t = None
        y_t = None
        best_i = 0
        tq = tqdm(H)
        tq.set_description(f"Round {t+1}")
        for h_i, h in enumerate(tq, start=1):
            y_pred = cached_preds[h_i - 1]
            mask = (y_pred != y_train).astype(np.float64)
            e_h = np.sum(mask * w)
            if e_h < e_t:
                e_t = e_h
                h_t = h
                y_t = y_pred
                best_i = h_i
        print(f"Round {t+1} - Best hypothesis index {best_i}")
        best_idxs.append(best_i)
        selected_H[t] = h_t
        y_predictions[t] = y_t
        epsilons[t] = e_t

        e_clamped = min(max(e_t, eps), 1 - eps)
        a_t = 0.5 * math.log((1-e_clamped)/e_clamped) # Log base e
        alphas[t] = a_t

        # Weight update
        normalize = 2 * np.sqrt(e_clamped * (1-e_clamped))
        w = w * np.exp(-1 * y_train * y_t * a_t)/normalize
        if e_clamped != e_t:
            # The analytic normaliser is exact only for the unclamped error.
            w = w / np.sum(w)
    return np.array(alphas), np.array(epsilons), selected_H, np.array(y_predictions), best_idxs

# Number of boosting rounds; adaboost() reads this module-level name.
T = 20
# a: per-round hypothesis weights, e: per-round weighted errors,
# h_: selected hypotheses, y_: their training predictions,
# idxs: 1-based positions of the selected hypotheses within H.
a, e, h_, y_, idxs = adaboost(H, X_train, y_train)

Running Adaboost


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 1 - Best hypothesis index 51


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 2 - Best hypothesis index 43


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 3 - Best hypothesis index 1


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 4 - Best hypothesis index 27


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 5 - Best hypothesis index 1


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 6 - Best hypothesis index 31


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 7 - Best hypothesis index 10


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 8 - Best hypothesis index 87


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 9 - Best hypothesis index 1


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 10 - Best hypothesis index 63


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 11 - Best hypothesis index 1


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 12 - Best hypothesis index 79


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 13 - Best hypothesis index 10


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 14 - Best hypothesis index 31


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 15 - Best hypothesis index 1


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 16 - Best hypothesis index 43


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 17 - Best hypothesis index 10


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 18 - Best hypothesis index 15


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 19 - Best hypothesis index 1


HBox(children=(IntProgress(value=0, max=88), HTML(value='')))


Round 20 - Best hypothesis index 67


In [39]:
# Score the boosted ensemble (selected hypotheses h_ with weights a) on the test set.
pred = boosting_predict(a, H=h_, x=X_test)
print("***********************************************************")
# Series.ravel() is deprecated since pandas 2.2; to_numpy() gives the same 1-D array.
print("AdaBoost Test Accuracy", accuracy(y_test.to_numpy(), pred.flatten()))
print("***********************************************************")

***********************************************************
AdaBoost Test Accuracy 66.84491978609626
***********************************************************


**(c) How does the α learned by adaBoost compare to the one learned by coordinate descent/gradient
descent?**



**(d) Use bagging, with 20 bootstrap samples, to produce an average classifier for this data
set. How does it compare to the previous classifiers in terms of accuracy on the test set?**


In [64]:
def fit_decision_stump(data, attributes):
    """Return the stump from the module-level pool ``H`` that best fits ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Labelled examples; column ``0`` holds the class label and the other
        columns hold the attributes the stumps split on.
    attributes : sequence
        Currently unused; kept so existing callers keep working. The
        candidates always come from the module-level ``H``.

    Returns
    -------
    dict
        The first hypothesis in ``H`` that attains the highest accuracy on
        ``data``.

    Raises
    ------
    ValueError
        If no hypothesis could be selected (for example, ``H`` is empty).
    """
    # Labels are the same for every candidate, so extract them once.
    # (Series.ravel() is deprecated since pandas 2.2; to_numpy() is equivalent.)
    y_true = data[0].to_numpy()

    # Start below any attainable accuracy so a real hypothesis is always
    # returned, never a non-hypothesis sentinel.
    best_accuracy = -1
    best_h = None
    for h in H:
        y_pred = data.apply(lambda row, h=h: predict(row, h), axis=1)
        acc = accuracy(y_true, y_pred.to_numpy())
        if acc > best_accuracy:
            best_accuracy = acc
            best_h = h
    if best_h is None:
        raise ValueError("no hypothesis could be fitted to the given data")
    return best_h

In [65]:
# Number of bootstrap samples.
B = 20
# Bagging: fit one decision stump per bootstrap resample of the training set.
# NOTE: this rebinds T, which an earlier cell set to the AdaBoost round count
# (an int), to the list of bagged stumps.
T = []
attrs = attributes.to_list()
for b in range(B):
    # Draw m row positions uniformly with replacement (m = number of training rows).
    bootstrap_sample = heart_train.iloc[np.random.randint(m, size=m)]
#     T_b, best_split = fit_decision_stump_ig(bootstrap_sample, attrs)
    T_b = fit_decision_stump(bootstrap_sample, attrs)
    T.append(T_b)

In [66]:
from collections import defaultdict 

def stump_predict(T_b, x_row):
    """Classify one example with a single decision stump.

    ``T_b`` has the shape ``{attribute: {value: label}}``; ``x_row`` is
    anything indexable by attribute (a DataFrame row or a dict). Returns the
    label on the branch selected by ``x_row``, or ``None`` for an empty stump.
    """
    for split_attr, leaves in T_b.items():
        return leaves[x_row[split_attr]]
    return None


def random_forest_predict(T, data):
    """Classify every row of ``data`` by majority vote over the stumps in ``T``.

    Parameters
    ----------
    T : sequence of stumps
        Each element must be accepted by ``stump_predict``.
    data : pandas.DataFrame
        Examples to classify, one per row. Rows are visited by position, so
        the frame's index labels do not matter.

    Returns
    -------
    numpy.ndarray of int
        ``-1`` where strictly more stumps vote -1 than +1, otherwise ``+1``
        (ties go to +1).
    """
    m, _ = data.shape
    y_pred = np.ones(m, dtype=int)
    for i in range(m):
        # iloc, not loc: i is a row position, not an index label.
        row = data.iloc[i]
        votes = [stump_predict(t_b, row) for t_b in T]
        if votes.count(-1) > votes.count(1):
            y_pred[i] = -1
    return y_pred

# Score the bagged stumps on the held-out test set. Arguments are passed as
# (truth, prediction) to match accuracy()'s signature.
# Series.ravel() is deprecated since pandas 2.2; to_numpy() gives the same 1-D array.
test_accuracy = accuracy(y_test.to_numpy(), random_forest_predict(T, X_test))

print("***********************************************************")
print("Bagging Test accuracy", test_accuracy)
print("***********************************************************")

***********************************************************
Bagging Test accuracy 61.49732620320856
***********************************************************



**(e) Which of these 3 methods should be preferred for this data set, and why?**

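AdaBoost and coordinate descent should be preferred for this data set with this hypothesis space: they reached about 66.8% and 69.5% test accuracy respectively, versus about 61.5% for bagging. Decision stumps are not very expressive (high bias, low variance), and bagging mainly reduces variance, so it does little to improve accuracy here, whereas boosting combines many stumps into a more expressive weighted classifier.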