# Online Convex Optimization

In this notebook, we present the different algorithms implemented in this repository and compare them. Hyperparameters can be tuned globally for all the algorithms at the same time or individually, in each algorithm's own cell. This setup allows quick and easy comparison and can be used to experiment interactively with the different algorithms.

### Imports

In [None]:
import time
import numpy as np
import pandas as pd
import pathlib as Path
import seaborn as sns

In [None]:
from Algorithms.Adam import adaMax, adaMaxTemporal, adam, adamP, adamTemporal, adamproj
from Algorithms.Explo import sbeg, sreg
from Algorithms.GD import GradientDescent, projected_gd
from Algorithms.SGD import sgd, projected_sgd
from Algorithms.RFTL import adagrad, seg, smd
from Algorithms.ONS import ons
from Models.LinearSVM import LinearSVM
from utils import *

### Default Parameters
The default parameters shared by all the algorithms. They can be overridden later in each algorithm's own cell.

In [None]:
# Seed NumPy's global RNG once so that reruns from a fresh kernel are repeatable.
np.random.seed(123)

# Defaults shared by every algorithm cell below; override them locally in a
# cell to experiment with a single algorithm.
lr = 0.1         # passed as `lr` to the solvers (presumably the learning rate)
nepoch = 1       # number of epochs, also used as the x-range of the loss curves
lbd = 1 / 3      # passed as `lbd` to every solver (presumably the regularization weight)
z = 100          # radius of the ball used by the projected / constrained variants
gamma = 1 / 8    # only forwarded to `ons`
verbose = 100    # forwarded to every solver (presumably a logging period - TODO confirm)

### Initialization
We start by loading and normalizing the data. We only perform binary classification, so images of zeros from MNIST are labelled 1 and every other digit is labelled -1.

In [None]:
# Load MNIST from CSV (no header row): column 0 is the digit label, the
# remaining columns are the pixel values.
mnist_train = pd.read_csv('mnist_train.csv', sep=',', header=None)   # Reading

# Extract the pixels and rescale them by the largest training pixel value.
train_pixels = mnist_train.values[:, 1:]
pixel_scale = np.max(train_pixels)
train_data = train_pixels / pixel_scale
train_data = np.c_[train_data, np.ones(train_data.shape[0])]         # Add intercept

# Binary labels (convention chosen): digit 0 -> +1, every other digit -> -1.
# np.where builds a NEW array instead of writing into a slice of
# `mnist_train.values`, which may be a view of the DataFrame (silently
# mutating it) or read-only under pandas Copy-on-Write.
train_labels = np.where(mnist_train.values[:, 0] == 0, 1, -1)

# Same preprocessing for the test set. The training scale is reused so that
# both sets go through exactly the same transformation.
mnist_test = pd.read_csv('mnist_test.csv', sep=',', header=None)
test_data = mnist_test.values[:, 1:] / pixel_scale
test_data = np.c_[test_data, np.ones(test_data.shape[0])]
test_labels = np.where(mnist_test.values[:, 0] == 0, 1, -1)

# Wall-clock training time of each algorithm, filled in by the cells below.
time_dict = {}

# n samples, m features (pixels + intercept column).
n, m = train_data.shape

# Shared figure: the algorithm cells below draw onto ax[0], ax[1] and ax[2].
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 10))

### Gradient Descent 

In [None]:
# Unconstrained (full-batch) gradient descent on the linear SVM.
print("-----------GD----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
GDloss, wts = GradientDescent(model, train_data, train_labels, nepoch, lbd, verbose, lr)
time_dict['gd'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
GDacc = accuracy(test_labels, pred_test_labels)
print('After {:3d} epoch, Unconstrained GD algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(nepoch, GDloss[-1], GDacc))

# Test metrics computed from the returned weights `wts`.
# average=False matches the projected-GD cell ("no average for gd"), so the
# two GD curves are computed the same way.
# NOTE(review): assumes the helpers average by default - confirm in utils.
GDaccuracies = compute_accuracies(wts, test_data, test_labels, average=False)
GDerrors = compute_errors(wts, test_data, test_labels, average=False)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), GDloss, label='gd')
ax[1].plot(GDaccuracies, label='gd')
ax[2].plot(GDerrors, label='gd')

### Constrained Gradient Descent
Gradient Descent with projection on $B_1(z)$

In [None]:
# Gradient descent with projection onto the ball of radius z.
print("-----------c_GD - z="+str(z)+"----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
GDprojloss, wts = projected_gd(model, train_data, train_labels, nepoch, lbd, z, verbose, lr)
time_dict['c_gd z='+str(z)] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
GDacc = accuracy(test_labels, pred_test_labels)
# Message fixed: it previously read "has a losof".
print('After {:3d} epoch, constrained GD (radius {:3d}) algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(nepoch, z, GDprojloss[-1], GDacc))

# Test metrics computed from the returned weights `wts`; no average for gd.
GDprojaccuracies = compute_accuracies(wts, test_data, test_labels, average=False)
GDprojerrors = compute_errors(wts, test_data, test_labels, average=False)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), GDprojloss, label='c_gd z='+str(z))
ax[1].plot(GDprojaccuracies, label='c_gd z='+str(z))
ax[2].plot(GDprojerrors, label='c_gd z='+str(z))

### Stochastic Gradient Descent 

In [None]:
# Unconstrained stochastic gradient descent on the linear SVM.
print("-----------SGD----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
SGDloss, wts = sgd(model, train_data, train_labels, nepoch, lbd, verbose, lr)
time_dict['sgd'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, Unconstrained SGD algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, SGDloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
SGDaccuracies = compute_accuracies(wts, test_data, test_labels)
SGDerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), SGDloss, label='sgd')
ax[1].plot(SGDaccuracies, label='sgd')
ax[2].plot(SGDerrors, label='sgd')

### Constrained Stochastic Gradient Descent
Stochastic Gradient Descent with projection on $B_1(z)$

In [None]:
# Stochastic gradient descent with projection onto the ball of radius z.
print("-----------c_SGD - z=" + str(z)+"----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
SGDprojloss, wts = projected_sgd(model, train_data, train_labels, nepoch, lbd, z, verbose, lr)
time_dict['c_sgd z='+str(z)] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, constrained SGD (radius {:3d}) algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, z, SGDprojloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
SGDprojaccuracies = compute_accuracies(wts, test_data, test_labels)
SGDprojerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), SGDprojloss, label='c_sgd z='+str(z))
ax[1].plot(SGDprojaccuracies, label='c_sgd z='+str(z))
ax[2].plot(SGDprojerrors, label='c_sgd z='+str(z))

### Stochastic Mirror Descent (SMD)

In [None]:
# Stochastic mirror descent (takes the radius z like the other constrained solvers).
# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
SMDprojloss, wts = smd(model, train_data, train_labels, nepoch, lbd, z, lr, verbose)
time_dict['smd'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, constrained SMD algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, SMDprojloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
SMDprojaccuracies = compute_accuracies(wts, test_data, test_labels)
SMDprojerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), SMDprojloss, label='smd')
ax[1].plot(SMDprojaccuracies, label='smd')
ax[2].plot(SMDprojerrors, label='smd')

### Stochastic Exponential Gradient (SEG)

In [None]:
# Stochastic exponentiated gradient (takes the radius z like the other constrained solvers).
# Header fixed: this cell runs SEG but previously announced "SMD".
print("-----------SEG----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
SEGloss, wts = seg(model, train_data, train_labels, nepoch, lbd, z, lr, verbose)
time_dict['seg'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print('After {:3d} epoch, constrained SEG algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(nepoch, SEGloss[-1], acc))

# Test metrics computed from the returned weights `wts`.
SEGaccuracies = compute_accuracies(wts, test_data, test_labels)
SEGerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), SEGloss, label='seg')
ax[1].plot(SEGaccuracies, label='seg')
ax[2].plot(SEGerrors, label='seg')

### Adaptive Gradient (Adagrad)

In [None]:
# Adagrad constrained to the ball of radius z (no explicit learning rate is passed).
print("-----------Adagrad - z=" + str(z)+"----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
Adagradloss, wts = adagrad(model, train_data, train_labels, nepoch, lbd, z, verbose)
time_dict['adagrad'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, constrained Adagrad (radius {:3d}) algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, z, Adagradloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
Adagradaccuracies = compute_accuracies(wts, test_data, test_labels)
Adagraderrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), Adagradloss, label='adagrad z='+str(z))
ax[1].plot(Adagradaccuracies, label='adagrad z='+str(z))
ax[2].plot(Adagraderrors, label='adagrad z='+str(z))

### Online Newton Step (ONS)

In [None]:
# Online Newton Step constrained to the ball of radius z; the only solver that uses `gamma`.
print("-----------ONS - z=" + str(z) + "----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
ONSloss, wts = ons(model, train_data, train_labels, nepoch, lbd, gamma, z, verbose)
time_dict['ons'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
# Message fixed: the "(radius ...)" parenthesis was never closed.
print('After {:3d} epoch, ONS (radius {:3d}) algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(nepoch, z, ONSloss[-1], acc))

# Test metrics computed from the returned weights `wts`.
ONSaccuracies = compute_accuracies(wts, test_data, test_labels)
ONSerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), ONSloss, label='ons z='+str(z))
ax[1].plot(ONSaccuracies, label='ons z='+str(z))
ax[2].plot(ONSerrors, label='ons z='+str(z))

### SREG

In [None]:
# SREG solver from Algorithms.Explo, run with radius z.
print("-----------SREG - z=" + str(z) + "----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
SREGloss, wts = sreg(model, train_data, train_labels, nepoch, lbd, z, lr, verbose)
time_dict['sreg'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, constrained SREG (radius {:3d}) algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, z, SREGloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
SREGaccuracies = compute_accuracies(wts, test_data, test_labels)
SREGerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), SREGloss, label='sreg z='+str(z))
ax[1].plot(SREGaccuracies, label='sreg z='+str(z))
ax[2].plot(SREGerrors, label='sreg z='+str(z))

### SBEG

In [None]:
# SBEG solver from Algorithms.Explo, run with radius z.
print("-----------SBEG - z=" + str(z) + "----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
SBEGloss, wts = sbeg(model, train_data, train_labels, nepoch, lbd, z, lr, verbose)
time_dict['sbeg'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, constrained SBEG algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, SBEGloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
SBEGaccuracies = compute_accuracies(wts, test_data, test_labels)
SBEGerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), SBEGloss, label='sbeg z='+str(z))
ax[1].plot(SBEGaccuracies, label='sbeg z='+str(z))
ax[2].plot(SBEGerrors, label='sbeg z='+str(z))

### Adam

In [None]:
# Adam with moment coefficients [0.9, 0.999]. `z` is NOT passed to `adam`,
# so the header and legend labels no longer advertise a radius; this also
# matches the 'adam' key used for the timing entry.
print("-----------Adam----------- \n")

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
Adamloss, wts = adam(model, train_data, train_labels, lr, nepoch, lbd, [0.9, 0.999], verbose)
time_dict['adam'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print('After {:3d} epoch, adam algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(nepoch, Adamloss[-1], acc))

# Test metrics computed from the returned weights `wts`.
Adamaccuracies = compute_accuracies(wts, test_data, test_labels)
Adamerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), Adamloss, label='adam')
ax[1].plot(Adamaccuracies, label='adam')
ax[2].plot(Adamerrors, label='adam')

### Adam Fixed LR

In [None]:
# Adam again, this time with adaptative_lr=False.
# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
AdamLRloss, wts = adam(
    model, train_data, train_labels, lr, nepoch, lbd, [0.9, 0.999], verbose,
    adaptative_lr=False,
)
time_dict['adam_fixlr'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, adam with fixed lr algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, AdamLRloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
AdamLRaccuracies = compute_accuracies(wts, test_data, test_labels)
AdamLRerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), AdamLRloss, label='adam_fixlr')
ax[1].plot(AdamLRaccuracies, label='adam_fixlr')
ax[2].plot(AdamLRerrors, label='adam_fixlr')

### Adam Projected

In [None]:
# Projected Adam: same call shape as `adam`, plus the radius z.
# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
AdamProjloss, wts = adamproj(model, train_data, train_labels, lr, nepoch, lbd, z, [0.9, 0.999], verbose)
time_dict['adamproj'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, projected adam algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, AdamProjloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
AdamProjaccuracies = compute_accuracies(wts, test_data, test_labels)
AdamProjerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), AdamProjloss, label='adamproj')
ax[1].plot(AdamProjaccuracies, label='adamproj')
ax[2].plot(AdamProjerrors, label='adamproj')

### Adamp

In [None]:
# Adam variant parameterized by a norm order p (reported as "norm L<p>").
p = 3

# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
AdamPloss, wts = adamP(model, train_data, train_labels, lr, nepoch, lbd, [0.9, 0.999], p, verbose)
time_dict['adamp'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, adam with norm L{:3d} algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, p, AdamPloss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
AdamPaccuracies = compute_accuracies(wts, test_data, test_labels)
AdamPerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), AdamPloss, label='adamp')
ax[1].plot(AdamPaccuracies, label='adamp')
ax[2].plot(AdamPerrors, label='adamp')

### Adam Temp

In [None]:
# Adam with temporal averaging.
# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
AdamTemploss, wts = adamTemporal(model, train_data, train_labels, lr, nepoch, lbd, [0.9, 0.999], verbose)
time_dict['adamtemp'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
# Message fixed: a space was missing between "loss of" and the value.
print('After {:3d} epoch, adam with temporal averaging algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(nepoch, AdamTemploss[-1], acc))

# Test metrics computed from the returned weights `wts`.
AdamTempaccuracies = compute_accuracies(wts, test_data, test_labels)
AdamTemperrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), AdamTemploss, label='adamtemp')
ax[1].plot(AdamTempaccuracies, label='adamtemp')
ax[2].plot(AdamTemperrors, label='adamtemp')

### Adamax

In [None]:
# AdaMax, called with the same arguments as `adam`.
# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
AdaMaxLoss, wts = adaMax(model, train_data, train_labels, lr, nepoch, lbd, [0.9, 0.999], verbose)
time_dict['adamax'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, AdaMax algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, AdaMaxLoss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
AdaMaxaccuracies = compute_accuracies(wts, test_data, test_labels)
AdaMaxerrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), AdaMaxLoss, label='adamax')
ax[1].plot(AdaMaxaccuracies, label='adamax')
ax[2].plot(AdaMaxerrors, label='adamax')

### Adamax Temp

In [None]:
# AdaMax with temporal averaging.
# Fresh model; only the training call is timed.
model = LinearSVM(m)
tic = time.time()
AdaMaxTempLoss, wts = adaMaxTemporal(model, train_data, train_labels, lr, nepoch, lbd, [0.9, 0.999], verbose)
time_dict['adamaxtemp'] = time.time() - tic

# Evaluate the trained model on the test set.
pred_test_labels = model.predict(test_data)
acc = accuracy(test_labels, pred_test_labels)
print(
    'After {:3d} epoch, AdaMax with temporal averaging algorithm has a loss of {:1.6f} and accuracy {:1.6f}'.format(
        nepoch, AdaMaxTempLoss[-1], acc
    )
)

# Test metrics computed from the returned weights `wts`.
AdaMaxTempaccuracies = compute_accuracies(wts, test_data, test_labels)
AdaMaxTemperrors = compute_errors(wts, test_data, test_labels)

# One curve per panel of the shared figure.
ax[0].plot(np.arange(nepoch), AdaMaxTempLoss, label='adamaxtemp')
ax[1].plot(AdaMaxTempaccuracies, label='adamaxtemp')
ax[2].plot(AdaMaxTemperrors, label='adamaxtemp')

### Curves 

In [None]:
# Finalize the shared 3-panel figure that the algorithm cells drew onto.

# Log-scaled x axis on every panel.
ax[0].set_xscale('log')
ax[1].set_xscale('log')
ax[2].set_xscale('log')

# The loss is not confined to (0, 1), so it gets a plain log scale: a logit
# axis would mask every value outside that open interval.
ax[0].set_yscale('log')
# Accuracy and error are proportions, so the logit scale is kept for them.
ax[1].set_yscale('logit')
ax[2].set_yscale('logit')

# Legend, title and x label for each panel.
ax[0].legend()
ax[1].legend()
ax[2].legend()
ax[0].set_title('Loss')
ax[1].set_title('Accuracy')
ax[2].set_title('Error')
ax[0].set_xlabel('Epochs')
ax[1].set_xlabel('Epochs')
ax[2].set_xlabel('Epochs')

# Save through the Figure object itself. `plt.savefig` targets pyplot's
# *current* figure, which is no longer `fig` once the cell that created it has
# finished under the notebook inline backend, so it would write an empty image.
fig.savefig('LossAccuraciesErrors.png')
plt.show()

# Bare expression so the notebook renders `fig` even if pyplot no longer tracks it.
fig

In [None]:
# Bar chart of the wall-clock training time recorded in `time_dict`
# (time.time() differences, i.e. seconds) for each algorithm.

# NOTE(review): the recorded times are multiplied by 20 before plotting; the
# reason is not visible in this notebook - confirm, or set the factor to 1.
time_scale = 20

keys = list(time_dict.keys())

# Dedicated figure with its own names so the shared `fig` / `ax` of the curve
# plots are not clobbered and nothing depends on pyplot's current figure.
time_fig, time_ax = plt.subplots(figsize=(15, 5))
sns.barplot(x=keys, y=[time_dict[k] * time_scale for k in keys], ax=time_ax)

time_ax.set_title('Execution time')
time_ax.set_xlabel('Algorithm')
time_ax.set_ylabel('Training time (s) x {}'.format(time_scale))
# One bar per algorithm: rotate the tick labels so they do not overlap.
time_ax.tick_params(axis='x', rotation=45)
time_fig.tight_layout()

time_fig.savefig('execution_time.png')
plt.show()