# The Adam optimizer

The primary usefulness of the adam optimizer is its improved capabilites over other optimizers in almost all cases. To demonstrate this improvement we are going to implement several different optimizers on a basic toy problem.
But first, some set up:

In [19]:
import time
import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torchmetrics.classification import Accuracy
from contextlib import contextmanager
import numpy as np
import torch
import torchvision.transforms as transforms
from torchmetrics.classification import Accuracy
import matplotlib.pyplot as plt
import torchmin

In [20]:
# function for measuring time taken curtises of ChatGPT 
@contextmanager
def time_block(label="Elapsed time"):
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        print(f"{label} elapsed time: {end - start:.6f} seconds")

# function for measuring accuracy
def get_acc(X_test, y_test,classifier):
    num_classes = 10
    acc = Accuracy(task="multiclass", num_classes=num_classes)
    classifier.eval()
    with torch.no_grad():
        y_pred = classifier(torch.from_numpy(X_test).float())  # [899, 10] 
        y_true = torch.tensor(y_test)                           # [899]
        # Convert logits â†’ predicted class indices
        y_pred_labels = torch.argmax(y_pred, dim=1)             # [899]
    accuracy = acc(y_pred_labels, y_true).item()
    print(
        f"Final accuracy: {accuracy:.4f}"
    )

### Toy digits dataset

In [21]:
digits = load_digits()
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1)) #flattens the data
X_train, X_test, y_train, y_test = train_test_split( #splits into training and testing sets
    data, digits.target, test_size=0.5, shuffle=False
)

### Basic Classification Model

Now we are going to define a classification model that we are going to use, pretty basic, but we will be changing up the optimizer on this model to compare perforamces.

In [22]:
class classifier_model(torch.nn.Module):
    
    def __init__(self,input_size,output_size):
        super().__init__()
        self.layer1 = torch.nn.Sequential(torch.nn.Linear(input_size,output_size)) # single layer model
        
    def forward(self,X):
        z = self.layer1(X)         # Apply the first layer (and only)
        return(z)            # Return the result (and the latent space variable

#training a model automatically
def train_model(L,optimizer,classifier, X_train, y_train, display=False):
    n_epochs = 15
    for epoch in range(n_epochs):
        loss = 0
        for X,y in zip(X_train,y_train): #y are labels
            labels = torch.tensor(y)
            inputs = torch.from_numpy(X).to(torch.float32)
        
            optimizer.zero_grad()  # Clear gradients
            outputs = classifier(inputs) #gets the outputs of the model
            train_loss = L(outputs,labels) #gets loss
            train_loss.backward() # compute accumulated gradients
            optimizer.step()# perform parameter update based on current gradients
            loss += train_loss.item() # add the mini-batch training loss to epoch loss
        loss = loss / len(X_train) # compute the epoch training loss
        if display:
            print(f"epoch : {epoch + 1}/{n_epochs}, loss = {loss}")
    return loss #this is the final loss at the end of all the training process
            

In [23]:
classifier = classifier_model(64,10) #defines the model
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
train_model(L,optimizer,classifier, X_train, y_train)

0.008695103159739067

### Gradient Scaling
Unlike some basic optimizers, the Adam Optimizer is invariant to gradient scaling, so the default learning rate works well for most datasets. Run the optimizer on the dataset scaled datasets and compare the final accuracy. Is there any scalar that causes a noticable decrease in performance? Then, try the same (or different) scalars on Stochastic Gradient Descent. When does accuracy start to drop?

In [24]:
classifier = classifier_model(64,10) #defines the model
"""
Run the classifier with the Adam Optimizer here
?remove the next line? 
""" 
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test,classifier)
    print(
        f"This is the final loss of the model: {final_loss:.4f} "
    )

Final accuracy: 0.9066
This is the final loss of the model: 0.0106 
Adam optimizer elapsed time: 1.242114 seconds


In [25]:
"""
Scale the training/testing data. Compare the accuracies.
"""
classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
X_train_scaled = X_train/50
with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train_scaled, y_train)
    get_acc(X_test/50, y_test,classifier)


classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#####################
X_train_scaled = X_train*50
with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train_scaled, y_train)
    get_acc(X_test*50, y_test,classifier)

Final accuracy: 0.9043
Adam optimizer elapsed time: 1.282248 seconds
Final accuracy: 0.9055
Adam optimizer elapsed time: 1.233077 seconds


### Compare this with regular Stochastic Gradient Descent

In [26]:
classifier = classifier_model(64,10) #defines the model
"""
Now, run Stochastic Gradient Descent with at least twice with different scalars. Compare the change in accuracy to the Adam Optimizer.
"""
##################### Have people do?
optimizer = torch.optim.SGD(classifier.parameters(), lr=1e-3) 
L = torch.nn.CrossEntropyLoss()
#####################
with time_block("SGD optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train*(1/5), y_train)
    get_acc(X_test*(1/5), y_test,classifier)

classifier = classifier_model(64,10) #defines the model
with time_block("SGD optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train*(5), y_train)
    get_acc(X_test*(5), y_test,classifier)

classifier = classifier_model(64,10) #defines the model (!WITHOUT SCALING)
with time_block("SGD optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test,classifier)

Final accuracy: 0.9099
SGD optimizer elapsed time: 0.874985 seconds
Final accuracy: 0.0945
SGD optimizer elapsed time: 0.809412 seconds
Final accuracy: 0.1346
SGD optimizer elapsed time: 0.811626 seconds


### If scaling does not affect the data, what about normalization? Compare the accuracy of the model before and after normalization. As a bonus, implement both with cross validation for more stable results!

In [27]:
"""
Normalize the training/testing data and run the classifier using the Adam optimizer
"""
classifier = classifier_model(64,10) #defines the model
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()

mean = np.mean(X_train, axis=0, keepdims=True) 
std  = np.std(X_train, axis=0, keepdims=True)   

eps = 1e-8
X_train_scaled = (X_train - mean) / (std + eps)
X_test_scaled  = (X_test  - mean) / (std + eps)


with time_block("Adam optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train_scaled, y_train)
    get_acc(X_test_scaled, y_test,classifier)

Final accuracy: 0.9232
Adam optimizer elapsed time: 1.242129 seconds


### The Adam Optimizer is based off of both RMSProp and AdaGrad.
Empirically, is there any difference in accuracy for this basic dataset?

In [28]:
"""
Run the classifier with RMSprop using the built-in torch function
"""
classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.RMSprop(classifier.parameters(), lr=1e-3)
L = torch.nn.CrossEntropyLoss()
#####################
with time_block("RMSprop optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test,classifier)

Final accuracy: 0.9188
RMSprop optimizer elapsed time: 1.111017 seconds


In [29]:
"""
Run the classifier with Adagrad using the built-in torch function. Adaprop specifically is known to have issues with step size decreasing too drastically. 
Does accuracy improve with a different learning rate?
"""
classifier = classifier_model(64,10) #defines the model
##################### Have people do?
optimizer = torch.optim.Adagrad(classifier.parameters(), lr=1e-2)
L = torch.nn.CrossEntropyLoss()
#####################
#train_model(L,optimizer,classifier)
with time_block("Adagrad optimizer"):
    final_loss = train_model(L,optimizer,classifier, X_train, y_train)
    get_acc(X_test, y_test,classifier)
    print(f"This is the final loss of the model: {final_loss}")

Final accuracy: 0.9177
This is the final loss of the model: 0.09698407760616795
Adagrad optimizer elapsed time: 1.077773 seconds


# Newtons method

What Adam does is related to a method called Newtons method.

Newtons method works by calcuating local curvuture information using second partial derivatives of your objective function to get a sense of what directions you can speed along, and what ones you want to move more carefully

In [30]:
#from torchmin.optim import MinimizeWrapper
import torchmin
#import pytorch_minimize

In [None]:
classifier = classifier_model(64,10) #defines the model
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3) #adam optimizer for the gradient decent
L = torch.nn.CrossEntropyLoss()
#torchmin.newton._minimize_newton_exact()
#from pytorch_minimize.optim import MinimizeWrapper



In [None]:
torchmin.newton._minimize_newton_exact(L(torch.from_numpy(X_train),torch.from_numpy(y_train)),) #Use this somehow? #!ASK DOUG MABYEder

AttributeError: 'generator' object has no attribute 'numel'

In [33]:
optimizer = MinimizeWrapper(
    model.parameters(),
    method="newton-cg",
    tol=1e-6,
    options={"maxiter": 20}
)

def closure():
    optimizer.zero_grad()
    preds = model(X)
    loss = criterion(preds, y)
    loss.backward(create_graph=True)
    return loss

optimizer.step(closure)

NameError: name 'MinimizeWrapper' is not defined

In [None]:
n_epochs = 15
for epoch in range(n_epochs):
    loss = 0
    for X,y in zip(X_train,y_train): #y are labels
        labels = torch.tensor(y)
        inputs = torch.from_numpy(X).to(torch.float32)
        
        optimizer.zero_grad()  # Clear gradients
        outputs = classifier(inputs) #gets the outputs of the model
        train_loss = L(outputs,labels) #gets loss
        train_loss.backward() # compute accumulated gradients
        optimizer.step()# perform parameter update based on current gradients
        loss += train_loss.item() # add the mini-batch training loss to epoch loss
    loss = loss / len(X_train) # compute the epoch training loss
    if display:
        print(f"epoch : {epoch + 1}/{n_epochs}, loss = {loss}")

One would think that if newtons method is so exact that we would want to use it more often. Unfortunualy the hessian is a slow matrix to calculate. As you more then likely noticed, using the full hessian is time consuming so we dont want to do that, additionally we have a dimensionality problem as you will see here:

# The Speed of the hessian matrix



In [None]:


###############################################delete this cell of code?
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=True)

def f(a, b):
    return a**2 * b

output = f(x, y)
df_dx = torch.autograd.grad(outputs=output, inputs=x, create_graph=True)[0]
print(f"First derivative df/dx: {df_dx.item()}")

df_dy = torch.autograd.grad(outputs=output, inputs=y, create_graph=True)[0]
print(f"First derivative df/dy: {df_dy.item()}")

#d2f_dy2 = torch.autograd.grad(outputs=df_dy, inputs=y,retain_graph=True,allow_unused=True)[0]
#print(f"Second derivative d^2f/dy^2: {d2f_dy2.item()}")

d2f_dx2 = torch.autograd.grad(outputs=df_dx, inputs=x,retain_graph=True)[0]
print(f"Second derivative d^2f/dx^2: {d2f_dx2.item()}")

d2f_dydx = torch.autograd.grad(outputs=df_dx, inputs=y,retain_graph=True)[0]
print(f"Mixed derivative d^2f/dydx: {d2f_dydx.item()}")

d2f_dxdy = torch.autograd.grad(outputs=df_dy, inputs=x,retain_graph=True)[0]
print(f"Mixed derivative d^2f/dxdy: {d2f_dxdy.item()}")



#d2f_dy2 = torch.autograd.grad(outputs=df_dy, inputs=y,retain_graph=True,allow_unused=True)[0]
#print(f"Mixed derivative d2f_dy2: {d2f_dy2.item()}")

First derivative df/dx: 12.0
First derivative df/dy: 4.0
Second derivative d^2f/dx^2: 6.0
Mixed derivative d^2f/dydx: 4.0
Mixed derivative d^2f/dxdy: 4.0


Some sore of activtiyy where they calucate the full hessian, the code isnt finalized or even working right now but thats the concept

In [None]:
import torch
from torch.func import hessian, jacfwd, jacrev
'''
# 1. Define a function that returns a single scalar loss
def compute_loss(model_params, input_data, targets, model, loss_fn):
    # Use functional_call to call the model with specific parameters
    outputs = torch.func.functional_call(model, model_params, input_data)
    loss = loss_fn(outputs, targets)
    return loss

# 2. Instantiate your model and loss function
model = torch.nn.Linear(10, 1) # Example model
loss_fn = torch.nn.MSELoss()    # Example loss function
input_data = torch.randn(1, 10)
targets = torch.randn(1, 1)

# 3. Extract parameters as a flat dictionary or tuple (functorch format)
# Note: make_functional is a utility to easily get functional model and params
from torch.func import make_functional
fnet, params = make_functional(model)

# 4. Compute the Hessian
# We need to wrap the loss computation in a function that takes params as input
def get_hessian_func(params):
    return compute_loss(params, input_data, targets, model, loss_fn)

# Calculate the full Hessian matrix
# argnums=0 specifies we want the Hessian with respect to the first argument (params)
full_hessian = hessian(get_hessian_func, argnums=0)(params)
'''

'\n# 1. Define a function that returns a single scalar loss\ndef compute_loss(model_params, input_data, targets, model, loss_fn):\n    # Use functional_call to call the model with specific parameters\n    outputs = torch.func.functional_call(model, model_params, input_data)\n    loss = loss_fn(outputs, targets)\n    return loss\n\n# 2. Instantiate your model and loss function\nmodel = torch.nn.Linear(10, 1) # Example model\nloss_fn = torch.nn.MSELoss()    # Example loss function\ninput_data = torch.randn(1, 10)\ntargets = torch.randn(1, 1)\n\n# 3. Extract parameters as a flat dictionary or tuple (functorch format)\n# Note: make_functional is a utility to easily get functional model and params\nfrom torch.func import make_functional\nfnet, params = make_functional(model)\n\n# 4. Compute the Hessian\n# We need to wrap the loss computation in a function that takes params as input\ndef get_hessian_func(params):\n    return compute_loss(params, input_data, targets, model, loss_fn)\n\n# Calc

In [None]:
def test_func(x):
    return (x.sin()).sum()
x = torch.randn(5) #takes in 5 inputs, so 5x5 hessian
y = torch.randn(5)

full_hessian = hessian(test_func, argnums=0)(x)
print(full_hessian)

tensor([[-0.8283, -0.0000, -0.0000, -0.0000, -0.0000],
        [-0.0000, -0.1660, -0.0000, -0.0000, -0.0000],
        [ 0.0000,  0.0000,  0.9312,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.3379,  0.0000],
        [-0.0000, -0.0000, -0.0000, -0.0000, -0.1332]])


The Hessian encodes local curvature of the objective function, it allows the gradient to be scaled so that 'dierctions' that have little movement are downscaled, while directions with more curvuture are scaled up so more progress is made.

However this means in high dimensions we are calcuating THOUSANDS of values to figure out this curvture, and for full felgded models, this is simply not computationally doable

In [None]:
torchmin.newton._minimize_newton_exact(test_func,torch.randn(5))

 message: Optimization terminated successfully.
 success: True
  status: 0
     fun: tensor(-5.)
       x: tensor([-1.5708, -1.5708, -1.5708, -1.5708, -1.5708])
     nit: 6
    grad: tensor([-4.3711e-08, -4.3711e-08, -4.3711e-08, -4.3711e-08, -4.3711e-08])
    hess: tensor([[1., 0., 0., 0., 0.],
                  [0., 1., 0., 0., 0.],
                  [0., 0., 1., 0., 0.],
                  [0., 0., 0., 1., 0.],
                  [0., 0., 0., 0., 1.]])
    nfev: 15
   nfail: 2