# The Adam optimizer

The main appeal of the Adam optimizer is that it outperforms other optimizers in almost all cases. To demonstrate this improvement, we are going to run several different optimizers on a basic toy problem.

### But first, some setup:

In [2]:
import time
from contextlib import contextmanager
# Helper for measuring elapsed time, courtesy of ChatGPT (NOTE: we may not end up using it; we shall see)
@contextmanager
def time_block(label="Elapsed time"):
    """Time the body of a ``with`` statement and print the duration.

    The duration is measured with ``time.perf_counter`` and printed as
    ``"<label>: <seconds> seconds"`` with six decimal places. The report is
    emitted from a ``finally`` clause, so it is printed even when the body
    raises.

    Args:
        label: Text printed in front of the measured duration.
    """
    started_at = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started_at
        print(f"{label}: {elapsed:.6f} seconds")

In [3]:
# Demo of the timer: a deliberately heavy pure-Python loop
# (sum of the squares of 0 .. 9,999,999).
with time_block("My slow code"):
    total = sum(n * n for n in range(10_000_000))

My slow code: 0.445547 seconds


### Toy digits dataset

In [4]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
n_samples = len(digits.images)

# Flatten every image into one feature row: shape (n_samples, n_features).
data = digits.images.reshape(n_samples, -1)

# 50/50 split; shuffle=False keeps the original sample order, so the first
# half becomes the training set and the second half the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    digits.target,
    test_size=0.5,
    shuffle=False,
)

In [5]:
# Number of features in one flattened training sample.
X_train.shape[1]

64

### Classification model

In [23]:
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torchmetrics.classification import Accuracy
import matplotlib.pyplot as plt

class classifier_model(torch.nn.Module):
    """Single-layer linear classifier.

    Maps an input with ``input_size`` features to ``output_size`` raw scores
    through one ``torch.nn.Linear`` layer. No activation is applied.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # The lone Linear layer stays wrapped in a Sequential so that the
        # parameter names (``layer1.0.weight`` / ``layer1.0.bias``) are kept.
        linear = torch.nn.Linear(input_size, output_size)
        self.layer1 = torch.nn.Sequential(linear)

    def forward(self, X):
        """Return the output of the single linear layer applied to ``X``."""
        return self.layer1(X)
            
# Model: 64 input features -> 10 output scores.
classifier = classifier_model(input_size=64, output_size=10)

# Adam performs the gradient-descent parameter updates (learning rate 1e-3).
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)

# Cross-entropy loss, applied directly to the model's raw outputs.
L = torch.nn.CrossEntropyLoss()

In [31]:
#training
def train_model(L, optimizer, classifier, display=False, *, n_epochs=15, X=None, y=None):
    """Train ``classifier`` one sample at a time and return the final epoch's mean loss.

    Every sample gets its own forward pass, backward pass and optimizer step
    (i.e. a batch size of 1).

    Args:
        L: Loss callable, invoked as ``L(outputs, label)``.
        optimizer: Optimizer already bound to ``classifier``'s parameters.
        classifier: Model, invoked as ``classifier(inputs)``.
        display: When true, print the mean loss after every epoch.
        n_epochs: Number of passes over the data (default 15, the value that
            used to be hard-coded).
        X: Training samples, one row per sample. Defaults to the module-level
            ``X_train``.
        y: Integer class labels, one per sample. Defaults to the module-level
            ``y_train``.

    Returns:
        The mean per-sample loss of the last epoch, as a Python float.

    Raises:
        ValueError: If ``n_epochs`` is smaller than 1, the data is empty, or
            ``X`` and ``y`` have different lengths.
    """
    if n_epochs < 1:
        raise ValueError(f"n_epochs must be at least 1, got {n_epochs}")
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # Convert the whole data set once instead of once per sample per epoch.
    # Labels are cast to int64 explicitly so that label arrays of another
    # integer dtype (e.g. int32) are still accepted by the loss.
    all_inputs = torch.as_tensor(X, dtype=torch.float32)
    all_labels = torch.as_tensor(y, dtype=torch.long)

    n_samples = len(all_inputs)
    if n_samples == 0:
        raise ValueError("cannot train on an empty data set")
    if len(all_labels) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(all_labels)}"
        )

    for epoch in range(n_epochs):
        running_loss = 0.0
        for inputs, labels in zip(all_inputs, all_labels):
            optimizer.zero_grad()  # clear the gradients of the previous step
            outputs = classifier(inputs)
            train_loss = L(outputs, labels)
            train_loss.backward()  # gradients for this single sample
            optimizer.step()  # parameter update from those gradients
            running_loss += train_loss.item()
        loss = running_loss / n_samples  # mean per-sample loss of this epoch
        if display:
            print(f"epoch : {epoch + 1}/{n_epochs}, loss = {loss}")
    return loss  # mean loss of the final epoch
# Initial training run with per-epoch printing off; as the last expression
# of the cell, the returned final-epoch loss is what the notebook displays.
train_model(L, optimizer, classifier, display=False)

0.009929288514956924

In [40]:
# Fresh, untrained model so that this timed run starts from scratch.
classifier = classifier_model(input_size=64, output_size=10)

# ----- candidate exercise for readers ("Have people do?") -----
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)  # Adam, learning rate 1e-3
L = torch.nn.CrossEntropyLoss()
# ---------------------------------------------------------------

# The print sits inside the timed block, so its (small) cost is part of the
# reported time and the loss line appears before the timing line.
with time_block("Adam optimizer"):
    final_loss = train_model(L, optimizer, classifier)
    print(f"This is the final loss of the model: {final_loss}")

This is the final loss of the model: 0.009767960801593905
Adam optimizer: 1.351703 seconds


In [36]:
#torch.optim.LBFGS
#torch.optim.RMSprop
import torchmin #this is how you get the newtons method full 

# Fresh, untrained model so that this timed run starts from scratch.
classifier = classifier_model(input_size=64, output_size=10)

# ----- candidate exercise for readers ("Have people do?") -----
optimizer = torch.optim.RMSprop(classifier.parameters(), lr=1e-3)  # RMSprop, learning rate 1e-3
L = torch.nn.CrossEntropyLoss()
# ---------------------------------------------------------------

# The print sits inside the timed block, so its (small) cost is part of the
# reported time and the loss line appears before the timing line.
with time_block("RMSprop optimizer"):
    final_loss = train_model(L, optimizer, classifier)
    print(f"This is the final loss of the model: {final_loss}")

This is the final loss of the model: 0.018097349111964353
RMSprop optimizer: 1.147118 seconds


In [37]:
# Fresh, untrained model so that this timed run starts from scratch.
classifier = classifier_model(input_size=64, output_size=10)

# ----- candidate exercise for readers ("Have people do?") -----
optimizer = torch.optim.SGD(classifier.parameters(), lr=1e-3)  # plain SGD, learning rate 1e-3
L = torch.nn.CrossEntropyLoss()
# ---------------------------------------------------------------

# The print sits inside the timed block, so its (small) cost is part of the
# reported time and the loss line appears before the timing line.
with time_block("SGD optimizer"):
    final_loss = train_model(L, optimizer, classifier)
    print(f"This is the final loss of the model: {final_loss}")

This is the final loss of the model: 0.024601183763795054
SGD optimizer: 0.868933 seconds


In [38]:
# Fresh, untrained model so that this timed run starts from scratch.
classifier = classifier_model(input_size=64, output_size=10)

# ----- candidate exercise for readers ("Have people do?") -----
optimizer = torch.optim.Adagrad(classifier.parameters(), lr=1e-3)  # Adagrad, learning rate 1e-3
L = torch.nn.CrossEntropyLoss()
# ---------------------------------------------------------------

# The print sits inside the timed block, so its (small) cost is part of the
# reported time and the loss line appears before the timing line.
with time_block("Adagrad optimizer"):
    final_loss = train_model(L, optimizer, classifier)
    print(f"This is the final loss of the model: {final_loss}")

This is the final loss of the model: 1.4305289820944247
Adagrad optimizer: 1.105089 seconds


# The speed of the Hessian matrix

One would think that if Newton's method is so exact, we would want to use it more often. Unfortunately, the Hessian matrix is slow to compute, as you will see here:

In [10]:
# Inputs: the evaluation point (x, y) = (2, 3). requires_grad=True makes
# autograd track both scalars so they can be differentiated against.
x, y = (torch.tensor(value, requires_grad=True) for value in (2.0, 3.0))

def f(a, b):
    """Return ``a`` squared times ``b``."""
    a_squared = a**2
    return a_squared * b

# Evaluate f at (x, y); this records the graph that autograd differentiates.
output = f(x, y)

# First derivatives. create_graph=True keeps each derivative differentiable,
# which is what makes the second derivatives below possible.
(df_dx,) = torch.autograd.grad(output, x, create_graph=True)
print(f"First derivative df/dx: {df_dx.item()}")

(df_dy,) = torch.autograd.grad(output, y, create_graph=True)
print(f"First derivative df/dy: {df_dy.item()}")

# Second derivatives: differentiate the first derivatives once more.
# retain_graph=True keeps the graph alive for the calls that follow.
(d2f_dx2,) = torch.autograd.grad(df_dx, x, retain_graph=True)
print(f"Second derivative d^2f/dx^2: {d2f_dx2.item()}")

(d2f_dydx,) = torch.autograd.grad(df_dx, y, retain_graph=True)
print(f"Mixed derivative d^2f/dydx: {d2f_dydx.item()}")

(d2f_dxdy,) = torch.autograd.grad(df_dy, x, retain_graph=True)
print(f"Mixed derivative d^2f/dxdy: {d2f_dxdy.item()}")

# d^2f/dy^2 is left disabled (presumably on purpose): df/dy = x**2 does not
# depend on y, so with allow_unused=True autograd returns None for it and the
# .item() call would fail. Mathematically the value is 0.
#d2f_dy2 = torch.autograd.grad(outputs=df_dy, inputs=y,retain_graph=True,allow_unused=True)[0]
#print(f"Second derivative d^2f/dy^2: {d2f_dy2.item()}")

First derivative df/dx: 12.0
First derivative df/dy: 4.0
Second derivative d^2f/dx^2: 6.0
Mixed derivative d^2f/dydx: 4.0
Mixed derivative d^2f/dxdy: 4.0
