# Implementing a boosting algorithm for a classifier

First we will implement a least square linear regression classifier treating the y labels as 1 and -1. This is not a very good classifier.

Hopefully using the Adaboost algorithm will help improve this

# Simple Regression From Scratch

In [41]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [3]:
class Regression():
    """Implementation of  regression from scratch.
    """

    def __init__(self) -> None:
        self._lambda = 0.1
        self.X_maxs = None
        self.X_means = None

    def fit(self, X, y):
        """Fit the model to training data

        Args:
            X (np.array): Training data. Columns for features. Rows for test instances.
            y (np.array): Training target column. Must have same number of rows as X

        Returns:
            (np.array, np.array): Ridge regression weights, degrees of freedom.
        """
        self.calc_means_and_stds(X)
        X = self.prepare_X(X)
        self.n, self.k = X.shape
        w_ls = np.linalg.inv((X.T @ X)) @ X.T @ y
        self.weights = w_ls
        return w_ls

    def calc_means_and_stds(self, X):
        """Calculate and store the means and std.

        These will be used to normalize new data

        Args:
            X (np.array): Training data
        """
        self.X_maxs = X.max(axis=0)
        self.X_means = X.mean(axis=0)
        self.X_stds = X.std(axis=0)
        self.X_means[self.X_stds == 0] = 0 # Don't want to subtract any means if variance is zero (for 1 column)
        self.X_stds[self.X_stds == 0] = 1 # If STD is zero force it to one.

    def prepare_X(self, X):
        """Prepares any X data for model usage

        Subtract the column mean and normalize by variance.

        Args:
            X (np.array): Input array
        """
        if self.X_means is None:
            raise ValueError("Need to calc_means_and_stds at least once before this function.")
        X = (X - self.X_means) / self.X_stds
        
        return X

    def predict(self, X_in):
        """Predict y values for new X input

        Args:
            X_in (np.array): New data. Must have same shape as data this was trained on.

        Returns:
            np.array: The predicted data
        """
        X_prepared = self.prepare_X(X_in)
        yhat = X_prepared @ self.weights
        return yhat

In [4]:
base_path = "/Users/trevorgordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/"

X_train_all = pd.read_csv(base_path + "hw3-data/Prob1_X.csv", header=None).to_numpy()
y_train_all = pd.read_csv(base_path + "hw3-data/Prob1_y.csv", header=None).to_numpy()


In [73]:
from sklearn.model_selection import train_test_split
import math
import pdb

X_train, X_test, y_train, y_test = train_test_split(X_train_all, y_train_all, test_size=0.1, random_state=42)

rr = Regression()
w_rr = rr.fit(X_train, y_train)
y_pred = rr.predict(X_test) > 0
is_correct = (y_pred == y_test).reshape(y_test.shape)
acc = np.count_nonzero(is_correct) / len(y_pred)

print(f"Initial Accuracy is {100*acc:.1f}%")

Initial Accuracy is 50.0%


# Adaboost

The general idea is to create an ensemble of models where each model is given a random subset of the data

In [121]:
from collections import Counter

class Boosting():

    def __init__(self, num_b):
        self.num_b = num_b
        self.model = self.get_model()

    def get_model(self):
        raise NotImplementedError()


class BoostedRegression(Boosting):

    def __init__(self, num_b):
        return super().__init__(num_b)

    def get_model(self):
        return Regression()

    def fit(self, X_train, y_train):
        self.n = len(X_train)
        self.weights = (1/self.n)*np.ones(self.n, dtype=np.double)

        self.models = []
        self.model_confidence = []
        self.all_weights = []
        self.all_errors = []

        train_err = []


        for b_i in range(self.num_b):
            # print(f"Running loop {b_i}")
            # pdb.set_trace()
            try:
                choices = np.random.choice(range(self.n), p=self.weights, size=self.n)
            except ValueError:
                pdb.set_trace()

            # print("Choices")
            counted = Counter(choices)
            choice_to_print = {x: counted[x] for x in range(5)}
            # print(choice_to_print)
            X_train_b = X_train[choices]
            y_train_b = y_train[choices]

            new_model = Regression()

            new_model.fit(X_train_b, y_train_b)
            # y_pred_b = new_model.predict(X_train_b) > 0
            y_pred_m = np.sign(new_model.predict(X_train))
            # y_pred_b = np.apply_along_axis(lambda x: 1 if x else -1, 1, y_pred_b).reshape(y_train_b.shape)
            is_correct = (y_train == y_pred_m).reshape(y_pred_m.shape)
            correct_pos_neg = np.apply_along_axis(lambda correct: 1 if correct else -1, 1, is_correct)

            num_correct = np.count_nonzero(is_correct)
            # print(num_correct)
            # acc = num_correct / len(y_train_b)
            # print(f"Acc is {acc}")
            # train_err.append(1-acc)
            
            error = sum([w for w, c in zip(self.weights, is_correct) if not c])
            # alpha = 0.5 * np.log((1-error)/(error))
            zero_error = 10**(-308)
            
            alpha = 0.5 * np.log((1-(error + zero_error))/(error + zero_error))
            weight_scaling = np.exp(-1*alpha*correct_pos_neg)

            new_weights = (weight_scaling * self.weights)
            new_weights = new_weights / sum(new_weights)

            self.all_weights.append(new_weights)
            self.all_errors.append(error)

            # Add model
            self.models.append(new_model)
            self.model_confidence.append(alpha)

            self.weights = new_weights

            y_pred_m = self.predict(X_train) > 0
            y_pred_m = y_pred_m.reshape(y_train_b.shape)
            y_pred_m = np.apply_along_axis(lambda x: 1 if x else -1, 1, y_pred_m).reshape(y_train_b.shape)
            is_correct = (y_train == y_pred_m).reshape(y_pred_m.shape)

            num_correct = np.count_nonzero(is_correct)
            acc = num_correct / len(y_train_b)
            train_err.append(1-acc)

        return train_err

    def predict(self, X_in):

        out = np.zeros((len(X_in)))

        for m, conf in zip(self.models, self.model_confidence):
            if conf > 300:
                pdb.set_trace()
            y_pred_b = np.sign(m.predict(X_in)).reshape(out.shape)
            out += conf*y_pred_b

        return out

            

In [124]:
b = BoostedRegression(2500)
training_err = b.fit(X_train, y_train)


y_pred = b.predict(X_test) > 0
y_pred = y_pred.reshape(y_test.shape) > 0
y_pred = np.apply_along_axis(lambda x: 1 if x else -1, 1, y_pred).reshape(y_test.shape)


is_correct = (y_pred == y_test)
acc = np.count_nonzero(is_correct) / len(y_pred)

print(f"After boosting Accuracy is {100*acc:.1f}%")

After boosting Accuracy is 100.0%


In [125]:
import plotly.express as px
fig = px.line(y=training_err, x=range(len(training_err)), title=f"Training Error For Boosting Algorithm")
fig.show()

In [126]:
df_training_err = pd.DataFrame(training_err)

In [127]:

training_err_running_avg = np.cumsum(training_err)/range(1, len(training_err)+1)



upper_bound_error_df = pd.DataFrame(b.all_errors, columns=["e"])
upper_bound_error_df["one_half_err"] = upper_bound_error_df["e"].apply(lambda x: (0.5 - x)**2)
upper_bound_error_df["upper_bound_err"] = upper_bound_error_df["one_half_err"].cumsum()
upper_bound_error_df["upper_bound_err"] = np.exp(-2*upper_bound_error_df["upper_bound_err"])
upper_bound_error_df["training_err_running_avg"] = training_err_running_avg
upper_bound_error_df["training_err"] = training_err
upper_bound_error_df["x"] = range(len(upper_bound_error_df))




In [128]:
import plotly.express as px
fig = px.line(upper_bound_error_df, x="x", y=["training_err", "upper_bound_err"], title=f"Training Error For Boosting Algorithm")
fig.update_layout(yaxis_title="Training Error")
fig.show()

In [129]:
weights_over_time = pd.DataFrame(np.array(b.all_weights))
print(weights_over_time.shape)
weights_over_time["iter"] = range(len(weights_over_time))
weights_over_time = weights_over_time.melt(id_vars=["iter"])

(2500, 900)


In [136]:
import plotly.express as px

# Commenting plot as plot size is too large for notebook on git
# fig = px.line(weights_over_time, x="iter", y="value", color="variable", title=f"Weights Over Time For Boosting Algorithm")
# fig.show()


In [15]:
fig.write_image("/Users/loreliegordon/Library/Mobile Documents/com~apple~CloudDocs/Documents/root/Columbia/Fall2021/ELEN4720/Assignments/assignment3/submission/boosting_weights.png")

In [131]:
weights_over_time = pd.DataFrame(np.array(b.all_weights))
print(weights_over_time.shape)
weights_over_time["iter"] = range(len(weights_over_time))
weights_over_time = weights_over_time.melt(id_vars=["iter"])
weights_over_time_avg = weights_over_time.drop("variable", axis=1).groupby("iter", as_index=False).mean()

(2500, 900)


In [138]:
fig = px.line(weights_over_time_avg, x="iter", y="value", title=f"Avg Weights Over Time For Boosting Algorithm")
# Commenting plot as plot size is too large for notebook on git
# fig.show()

In [133]:
alpha = b.model_confidence
fig = px.line(alpha, title=f"Alpha Over Time For Boosting Algorithm")
fig.show()

In [137]:
alpha = b.all_errors
fig = px.line(alpha, title=f"Error Over Time For Boosting Algorithm")
# Commenting plot as plot size is too large for notebook on git
#fig.show()