# MNIST Classification with ARP Optimizer

This notebook demonstrates using the Adaptive Resistance-Potential (ARP) optimizer for training a neural network on the MNIST handwritten digit dataset.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np
from time import time

# Import the ARP optimizer
from adaptive_dynamics.arp.optimizers import ARP

## 1. Data Loading and Preprocessing

In [None]:
# Seed PyTorch's global RNG before anything stochastic runs (weight
# initialisation in the model cell, shuffling in the training loader) so a
# fresh Restart Kernel -> Run All repeats the same run. GPU kernels may still
# introduce small nondeterminism.
SEED = 42
torch.manual_seed(SEED)

# Convert images to tensors, then standardise with the MNIST mean and std
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Load MNIST. download=True is passed for both splits so the test split does
# not depend on the training split having fetched the files first.
train_dataset = datasets.MNIST('data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('data', train=False, download=True, transform=transform)

# Create data loaders (only the training data is shuffled)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

## 2. Define Neural Network Model

In [None]:
class SimpleNN(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    With the default arguments this is the 784 -> 128 -> 64 -> 10 network used
    throughout the notebook.

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=28 * 28, hidden1=128, hidden2=64, num_classes=10):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Flatten each sample and return raw logits of shape (batch, num_classes)."""
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No softmax here: the final layer's output is returned as-is
        return self.fc3(x)

# Create an instance of the model with the default layer sizes
model = SimpleNN()
print(model)

## 3. Set Up Optimizer and Loss Function

In [None]:
# Cross-entropy loss applied to the model's output and the integer targets
criterion = nn.CrossEntropyLoss()

# ARP optimizer over every parameter of the model.
#   lr           - learning rate
#   alpha        - adaptation rate
#   mu           - decay rate
#   weight_decay - L2 regularization
# NOTE(review): the optimizer is constructed here, before a later cell calls
# model.to(device). That is fine only if ARP allocates its per-parameter state
# lazily -- confirm against the ARP implementation.
optimizer = ARP(model.parameters(), lr=1e-3, alpha=0.01, mu=0.001, weight_decay=1e-5)

## 4. Training Loop

In [None]:
# Training function
def train(model, device, train_loader, optimizer, criterion, epoch):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to optimise; switched to training mode.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches. It must support
            ``len()`` and expose a ``dataset`` attribute (used for progress
            messages only).
        optimizer: Object whose ``zero_grad()`` and ``step()`` are called once
            per batch.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        epoch: Epoch number; used only in the progress messages.

    Returns:
        Tuple ``(avg_loss, accuracy)``: the per-sample average loss over the
        pass and the fraction (0-1) of correctly classified samples. Both are
        ``0.0`` when the loader yields no samples.
    """
    model.train()

    # A mean-reduced criterion returns a per-batch mean. Weight it by the batch
    # size so a smaller final batch is not over-represented in the epoch
    # average. A criterion without a ``reduction`` attribute is assumed to
    # return a mean.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"
    loss_sum = 0.0
    correct = 0
    total = 0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Record statistics
        num_in_batch = target.size(0)
        batch_loss = loss.item()
        loss_sum += (batch_loss * num_in_batch) if mean_reduced else batch_loss
        _, predicted = output.max(1)
        total += num_in_batch
        correct += predicted.eq(target).sum().item()

        # Print progress every 100 batches (running accuracy so far)
        if batch_idx % 100 == 0:
            print(f'Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * batch_idx / len(train_loader):.0f}%)]\t'
                  f'Loss: {batch_loss:.6f}\t'
                  f'Accuracy: {100. * correct / total:.2f}%')

    # Guard against an empty loader instead of dividing by zero
    if total == 0:
        return 0.0, 0.0
    return loss_sum / total, correct / total

# Evaluation function
def evaluate(model, device, test_loader, criterion):
    """Score ``model`` on every batch of ``test_loader`` without gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        device: Device every batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches.
        criterion: Loss callable invoked as ``criterion(output, target)``.

    Returns:
        Tuple ``(test_loss, accuracy)``: the per-sample average loss and the
        fraction (0-1) of correctly classified samples, both computed over the
        samples actually yielded by the loader. Both are ``0.0`` when the
        loader yields no samples.
    """
    model.eval()

    # A mean-reduced criterion returns a per-batch mean. Weight it by the batch
    # size so a smaller final batch is not over-represented in the average.
    # A criterion without a ``reduction`` attribute is assumed to return a mean.
    mean_reduced = getattr(criterion, "reduction", "mean") == "mean"
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            num_in_batch = target.size(0)
            batch_loss = criterion(output, target).item()
            loss_sum += (batch_loss * num_in_batch) if mean_reduced else batch_loss

            # keepdim=True gives pred shape (batch, 1); compare against the
            # target reshaped to match
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += num_in_batch

    # Guard against an empty loader instead of dividing by zero
    test_loss = loss_sum / total if total else 0.0
    accuracy = correct / total if total else 0.0

    print(f'\nTest set: Average loss: {test_loss:.4f}, '
          f'Accuracy: {correct}/{total} ({100. * accuracy:.2f}%)\n')

    return test_loss, accuracy

## 5. Train the Model

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model = model.to(device)

# Number of full passes over the training set
num_epochs = 5

# Per-epoch history, read later by the plotting cell
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

# Run all epochs; the timer spans both training and evaluation
start_time = time()
for epoch in range(1, num_epochs + 1):
    print(f"\nEpoch {epoch}/{num_epochs}")
    print("-" * 50)

    # One optimisation pass over the training data
    train_loss, train_acc = train(model, device, train_loader, optimizer, criterion, epoch)
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)

    # Score the updated model on the test set
    test_loss, test_acc = evaluate(model, device, test_loader, criterion)
    test_losses.append(test_loss)
    test_accuracies.append(test_acc)

training_time = time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

## 6. Plot Training and Evaluation Metrics

In [None]:
# Two side-by-side panels: loss on the left, accuracy on the right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))
epoch_ticks = range(1, num_epochs + 1)

# Left panel: training vs. test loss per epoch
loss_ax.plot(epoch_ticks, train_losses, 'bo-', label='Training Loss')
loss_ax.plot(epoch_ticks, test_losses, 'ro-', label='Test Loss')
loss_ax.set_title('Loss vs. Epochs')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.grid(True)

# Right panel: accuracies are stored as fractions, so scale them to percent
train_acc_pct = [acc * 100 for acc in train_accuracies]
test_acc_pct = [acc * 100 for acc in test_accuracies]
acc_ax.plot(epoch_ticks, train_acc_pct, 'bo-', label='Training Accuracy')
acc_ax.plot(epoch_ticks, test_acc_pct, 'ro-', label='Test Accuracy')
acc_ax.set_title('Accuracy vs. Epochs')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.legend()
acc_ax.grid(True)

fig.tight_layout()
plt.show()

## 7. Visualize ARP's Adaptive Learning Rate

In [None]:
# Query the optimizer; the result is indexed by parameter below
# (presumably parameter -> tensor of effective learning rates; confirm
# against the ARP API)
adaptive_lr_dict = optimizer.get_adaptive_lr()

# Pair each named parameter the optimizer reports on with the mean of its tensor
layer_lr_pairs = [
    (name, adaptive_lr_dict[param].mean().item())
    for name, param in model.named_parameters()
    if param in adaptive_lr_dict
]
layer_names = [name for name, _ in layer_lr_pairs]
avg_learning_rates = [avg_lr for _, avg_lr in layer_lr_pairs]

# Horizontal bar chart: one bar per reported parameter tensor
plt.figure(figsize=(10, 6))
plt.barh(layer_names, avg_learning_rates)
plt.title('ARP: Adaptive Learning Rates by Layer')
plt.xlabel('Average Effective Learning Rate')
plt.ylabel('Layer')
plt.tight_layout()
plt.show()

## 8. Visualize Predictions on Test Images

In [None]:
# Function to plot images with predictions
def plot_predictions(model, device, test_loader, num_images=10):
    """Show images from the first test batch with predicted and true labels.

    Titles are green when the prediction matches the label and red otherwise.

    Args:
        model: Classifier to run; switched to eval mode.
        device: Device the batch is moved to for inference.
        test_loader: Loader yielding ``(images, labels)``; only its first
            batch is used.
        num_images: Maximum number of images to draw. Capped at the size of
            the first batch; the grid grows in rows of five as needed.
    """
    model.eval()

    images, labels = next(iter(test_loader))

    with torch.no_grad():
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)

    # Never index past the batch, and size the grid to what is actually shown
    # (a fixed 2x5 grid fails for more than 10 images).
    num_shown = max(0, min(num_images, len(images)))
    cols = 5
    rows = max(1, (num_shown + cols - 1) // cols)
    fig = plt.figure(figsize=(12, 3 * rows))  # 12x6 for the default 10 images

    for i in range(num_shown):
        ax = fig.add_subplot(rows, cols, i + 1)
        # squeeze() drops singleton dimensions so imshow receives a 2-D array
        ax.imshow(images[i].cpu().numpy().squeeze(), cmap='gray')
        pred_label = predicted[i].item()
        true_label = labels[i].item()
        color = 'green' if pred_label == true_label else 'red'
        ax.set_title(f"Pred: {pred_label}\nTrue: {true_label}", color=color)
        ax.axis('off')

    fig.tight_layout()
    plt.show()

# Draw the default number of test images with the trained model's predictions
plot_predictions(model=model, device=device, test_loader=test_loader)

## 9. Compare with Standard Optimizers (Optional)

In [None]:
def train_compare_optimizers(opt_name, optimizer, epochs=3):
    """Train a fresh ``SimpleNN`` with the given optimizer and record metrics.

    The model is created inside this function, so the optimizer must be bound
    to *that* model's parameters. An optimizer built from some other model
    would leave the trained model's weights untouched.

    Args:
        opt_name: Label used in the progress output.
        optimizer: One of
            * a callable ``params -> optimizer`` (an optimizer class, a
              ``functools.partial`` or a lambda), called with the new model's
              parameters; or
            * an already-constructed ``torch.optim.Optimizer``. It is used
              only as a template: a new optimizer of the same type is built
              on the new model's parameters from ``optimizer.defaults``.
            Any other object is used as-is.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(train_losses, test_accuracies, training_time)``: per-epoch
        training loss, per-epoch test accuracy, and elapsed seconds.
    """
    # Create a new model
    model = SimpleNN().to(device)

    # Bind the optimizer to the model created above.
    if isinstance(optimizer, torch.optim.Optimizer):
        # Assumes the optimizer's constructor accepts its ``defaults`` as
        # keyword arguments (true for the torch.optim optimizers; confirm for
        # ARP). Per-group overrides on the template are not carried over.
        optimizer = type(optimizer)(model.parameters(), **optimizer.defaults)
    elif callable(optimizer):
        optimizer = optimizer(model.parameters())

    # Lists to store metrics
    train_losses = []
    test_accuracies = []

    # Training loop; the timer spans both training and evaluation
    start_time = time()
    for epoch in range(1, epochs + 1):
        print(f"\n{opt_name} - Epoch {epoch}/{epochs}")
        print("-" * 50)

        # Train
        train_loss, _ = train(model, device, train_loader, optimizer, criterion, epoch)
        train_losses.append(train_loss)

        # Evaluate
        _, test_acc = evaluate(model, device, test_loader, criterion)
        test_accuracies.append(test_acc)

    training_time = time() - start_time
    print(f"Training completed in {training_time:.2f} seconds")

    return train_losses, test_accuracies, training_time

# Optional comparison of ARP against Adam and SGD (3 epochs each, the default
# of train_compare_optimizers), followed by loss and accuracy plots.
# The code is disabled by being wrapped in a string literal; remove the
# surrounding triple quotes to run it.
# NOTE(review): each optimizer below is built from the parameters of a
# temporary SimpleNN() instance rather than from the model that
# train_compare_optimizers creates internally -- confirm the optimizer really
# updates the model being trained before relying on this comparison.
"""
print("Training with ARP optimizer")
arp_opt = ARP(SimpleNN().parameters(), lr=1e-3, alpha=0.01, mu=0.001)
arp_losses, arp_accs, arp_time = train_compare_optimizers("ARP", arp_opt)

print("Training with Adam optimizer")
adam_opt = torch.optim.Adam(SimpleNN().parameters(), lr=1e-3)
adam_losses, adam_accs, adam_time = train_compare_optimizers("Adam", adam_opt)

print("Training with SGD optimizer")
sgd_opt = torch.optim.SGD(SimpleNN().parameters(), lr=0.01, momentum=0.9)
sgd_losses, sgd_accs, sgd_time = train_compare_optimizers("SGD", sgd_opt)

# Plot comparison
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(range(1, len(arp_losses) + 1), arp_losses, 'b-', label=f'ARP ({arp_time:.2f}s)')
plt.plot(range(1, len(adam_losses) + 1), adam_losses, 'r-', label=f'Adam ({adam_time:.2f}s)')
plt.plot(range(1, len(sgd_losses) + 1), sgd_losses, 'g-', label=f'SGD ({sgd_time:.2f}s)')
plt.title('Training Loss Comparison')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

plt.subplot(1, 2, 2)
plt.plot(range(1, len(arp_accs) + 1), [acc * 100 for acc in arp_accs], 'b-', label='ARP')
plt.plot(range(1, len(adam_accs) + 1), [acc * 100 for acc in adam_accs], 'r-', label='Adam')
plt.plot(range(1, len(sgd_accs) + 1), [acc * 100 for acc in sgd_accs], 'g-', label='SGD')
plt.title('Test Accuracy Comparison')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()
"""

## Conclusion

In this notebook, we've demonstrated using the ARP optimizer from the Adaptive Dynamics Toolkit to train a neural network on the MNIST dataset. The ARP optimizer adapts learning rates based on gradient history using a conductance-like mechanism, which may offer benefits over traditional optimizers in some scenarios; the optional comparison in Section 9 lets you check this against Adam and SGD on the same task.

Key takeaways:
1. The ARP optimizer is easy to use as a drop-in replacement for standard optimizers
2. Its adaptive learning rate mechanism allows it to adjust to the training dynamics
3. The conductance state provides a form of memory similar to momentum but based on different principles

For more advanced use cases, you can experiment with different values for the adaptation rate (alpha) and decay rate (mu) parameters to fine-tune the optimizer's behavior for your specific problem.