Date: 11/12/24

In [None]:
# Mount Google Drive into the Colab runtime so data stored there can be read.
from google.colab import drive

# Directory inside the Colab VM where the Drive contents are exposed.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the train and test CSVs, then split each into labels and features.
train = pd.read_csv('datasets/coding5_train.csv')
test = pd.read_csv('datasets/coding5_test.csv')

# Column 'Y' is the label; every remaining column is used as a feature.
train_X = train.drop(columns='Y').to_numpy()
train_y = train['Y'].to_numpy()
test_X = test.drop(columns='Y').to_numpy()
test_y = test['Y'].to_numpy()

## Pegasos Implementation

In [None]:
# Subgradient helper functions
def beta_delta_t(beta_t, lam, x_i, y_i, eval):
  """Subgradient of the per-sample objective with respect to the weights.

  Args:
    beta_t: Current weight vector.
    lam: Regularization strength that scales beta_t.
    x_i: Feature vector of the current sample.
    y_i: Label of the current sample.
    eval: Margin value computed by the caller for this sample
      (note: the parameter name shadows the builtin `eval`).

  Returns:
    lam * beta_t, with y_i * x_i subtracted when eval is below 1.
  """
  shrink_term = lam * beta_t
  margin_violated = eval < 1
  return shrink_term - (y_i * x_i) if margin_violated else shrink_term
def alpha_delta_t(y_i, eval):
  """Subgradient of the per-sample objective with respect to the intercept.

  Args:
    y_i: Label of the current sample.
    eval: Margin value computed by the caller for this sample
      (note: the parameter name shadows the builtin `eval`).

  Returns:
    -1 * y_i when eval is below 1, otherwise 0.
  """
  if not (eval < 1):
    # Margin is satisfied, so the intercept receives no update.
    return 0
  return -1 * y_i

# Runs Pegasos for one pass over dataset
def one_epoch_Pegasos(beta, alpha, t, lam, X, y):
  """Run one pass of stochastic subgradient updates over the given samples.

  Samples are visited in the order in which they appear in X / y; any
  shuffling is the caller's responsibility.

  Args:
    beta: Weight vector at the start of the pass.
    alpha: Intercept at the start of the pass.
    t: Number of updates performed before this pass. It is incremented
      before each update, so passing 0 makes the first step size 1 / lam.
    lam: Regularization strength. Must be non-zero because the step size
      is 1 / (lam * t).
    X: Feature matrix with one row per sample.
    y: Labels aligned with the rows of X.

  Returns:
    Tuple (beta, alpha, t) after every sample in y has been processed.
  """
  # Iterate over the samples that were passed in, not over a global dataset,
  # so the function works for any (X, y) pair.
  for i in range(len(y)):
    # Update t & learning rate
    t += 1
    lr = 1 / (lam * t)

    # Evaluate current datapoint with the parameters from before this update
    margin = y[i] * (X[i].T @ beta + alpha)

    # Update Beta
    beta = beta - lr * beta_delta_t(beta, lam, X[i], y[i], margin)

    # Update Alpha (uses the same pre-update margin as the beta step)
    alpha = alpha - lr * alpha_delta_t(y[i], margin)

  return beta, alpha, t

# Prediction helper function
def predict(beta, alpha, X, pos_label=5, neg_label=6):
  """Classify each row of X with the linear rule X @ beta + alpha.

  Args:
    beta: Weight vector.
    alpha: Intercept.
    X: Feature matrix with one row per sample.
    pos_label: Label assigned when the score is strictly positive
      (defaults to 5, the value used throughout this notebook).
    neg_label: Label assigned otherwise, including a score of exactly 0
      (defaults to 6).

  Returns:
    Array holding pos_label or neg_label for every row of X.
  """
  scores = np.asarray(X) @ beta + alpha
  # Vectorized thresholding; a score of exactly 0 falls on the negative side.
  return np.where(scores > 0, pos_label, neg_label)

## Pegasos Testing

In [None]:
# Initialize Parameters
T = 20
# lam = 0.11288378916846883
lam = 0.5
beta = np.zeros(train_X.shape[1])
alpha = 0
# Update counter threaded through one_epoch_Pegasos. It must be defined here,
# otherwise this cell raises NameError on a fresh kernel (Restart & Run All).
t = 0
np.random.seed(12345)

# Change the train labels to 1 & -1 to get algorithm working correctly
# (work on a copy so the original 5/6 labels stay available for evaluation)
train_y_copy = train_y.copy()
train_y_copy[train_y_copy == 5] = 1
train_y_copy[train_y_copy == 6] = -1

# Run Pegasos for T epochs (each epoch is one full pass over the training set)
for epoch in range(T):
  # Randomly shuffle datapoints
  idx = np.random.permutation(len(train_y))
  beta, alpha, t = one_epoch_Pegasos(beta, alpha, t, lam, train_X[idx], train_y_copy[idx])

In [None]:
# Evaluate the fitted classifier on the training set
train_predictions = predict(beta, alpha, train_X)

# Share of misclassified training samples, expressed as a percentage
train_error = 100 * np.mean(train_predictions != train_y)
print(f'Train Error: {train_error:.3f}%')

# Rows are the true labels, columns are the predicted labels
print("Train Confusion Table:")
pd.crosstab(index=train_y, columns=train_predictions, rownames=['Actual'], colnames=['Predicted'])

Train Error: 0.500%
Train Confusion Table:


Predicted,5,6
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
5,99,1
6,0,100


In [None]:
# Predict on the held-out test data
test_predictions = predict(beta, alpha, test_X)

# Share of misclassified test samples, expressed as a percentage
test_error = np.mean(test_predictions != test_y) * 100
print(f'Test Error: {test_error:.3f}%')

# Rows are the true labels, columns are the predicted labels
print("Test Confusion Table:")
pd.crosstab(test_y, test_predictions, rownames=['Actual'], colnames=['Predicted'])

Test Error: 3.000%
Teat Confusion Table:


Predicted,5,6
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
5,289,11
6,7,293
