<a href="https://colab.research.google.com/github/TheS1n233/Distributed-Learning-Project5/blob/test/Distributed_Learning_Project5_HYPERPARAMSFINDING_ANDREA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install torch and torchvision

In [None]:
!pip install torch torchvision matplotlib


# Download the CIFAR-100 dataset

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# Data preprocessing
# Training pipeline: random augmentations, then tensor conversion and
# per-channel normalization with mean 0.5 / std 0.5.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Evaluation pipeline: deterministic, no augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Download CIFAR-100 dataset
train_dataset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)

# Second view of the same training images that applies the evaluation
# transform. Validation samples are drawn from this view so they are NOT
# randomly augmented (splitting `train_dataset` directly would make the
# validation subset inherit `transform_train`).
val_source = torchvision.datasets.CIFAR100(root='./data', train=True, download=False, transform=transform_test)

# Split training and validation sets (80% / 20%) over one shared permutation
# so the two subsets are disjoint.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
split_indices = torch.randperm(len(train_dataset)).tolist()
val_dataset = torch.utils.data.Subset(val_source, split_indices[train_size:])
train_dataset = torch.utils.data.Subset(train_dataset, split_indices[:train_size])

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)



In [7]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import torchvision
import torchvision.transforms as transforms
import numpy as np
from PIL import Image

# Data preprocessing
class Cutout(object):
    """Augmentation that blanks out one randomly placed square patch of an image.

    The patch is centred on a uniformly sampled pixel and spans ``size // 2``
    pixels in each direction; any part that falls outside the image is
    clipped away.
    """

    def __init__(self, size):
        # Nominal side length, in pixels, of the square that gets zeroed.
        self.size = size

    def __call__(self, img):
        """Return a PIL image equal to ``img`` with one square region set to zero."""
        pixels = np.array(img) if isinstance(img, Image.Image) else img

        height, width = pixels.shape[:2]
        half = self.size // 2

        # Sample the patch centre: row first, then column.
        center_y = np.random.randint(height)
        center_x = np.random.randint(width)

        top = np.clip(center_y - half, 0, height)
        bottom = np.clip(center_y + half, 0, height)
        left = np.clip(center_x - half, 0, width)
        right = np.clip(center_x + half, 0, width)

        # 1 keeps a pixel, 0 erases it; the mask is broadcast over the channel axis.
        keep = np.ones((height, width), np.float32)
        keep[top:bottom, left:right] = 0
        erased = pixels * keep[:, :, np.newaxis]

        return Image.fromarray(np.uint8(erased))

# Data transformations
# Training pipeline: random augmentations plus Cutout, then tensor conversion
# and per-channel normalization with mean 0.5 / std 0.5.
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    Cutout(size=8),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Evaluation pipeline: deterministic, no augmentation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Download CIFAR-100 dataset
train_dataset = torchvision.datasets.CIFAR100(
    root='./data',
    train=True,
    download=True,
    transform=transform_train
)
test_dataset = torchvision.datasets.CIFAR100(
    root='./data',
    train=False,
    download=True,
    transform=transform_test
)

# Second view of the same training images that applies the evaluation
# transform. Validation samples are drawn from this view so they are NOT
# randomly augmented (splitting `train_dataset` directly would make the
# validation subset inherit `transform_train`).
val_source = torchvision.datasets.CIFAR100(
    root='./data',
    train=True,
    download=False,
    transform=transform_test
)

# Split training and validation sets (80% / 20%) over one shared permutation
# so the two subsets are disjoint.
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
split_indices = torch.randperm(len(train_dataset)).tolist()
val_dataset = torch.utils.data.Subset(val_source, split_indices[train_size:])
train_dataset = torch.utils.data.Subset(train_dataset, split_indices[:train_size])

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define LeNet-5 model with adjustable dropout
class LeNet5(nn.Module):
    """LeNet-5 style CNN producing 100-way logits from 3-channel images.

    ``fc1`` expects 16 * 5 * 5 features, which is what two 5x5 convolutions
    each followed by 2x2 max-pooling yield for 32x32 inputs.

    Args:
        dropout1: Drop probability applied after the first dense layer.
        dropout2: Drop probability applied after the second dense layer.
    """

    def __init__(self, dropout1=0.1, dropout2=0.1):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Dense classifier head with dropout between the layers.
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.dropout1 = nn.Dropout(p=dropout1)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.dropout2 = nn.Dropout(p=dropout2)
        self.fc3 = nn.Linear(in_features=84, out_features=100)

    def forward(self, x):
        """Map a batch of images to unnormalised class scores."""
        features = F.max_pool2d(F.relu(self.conv1(x)), 2)
        features = F.max_pool2d(F.relu(self.conv2(features)), 2)
        hidden = torch.flatten(features, 1)
        hidden = self.dropout1(F.relu(self.fc1(hidden)))
        hidden = self.dropout2(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Define random search function with dropout
def random_search(train_loader, val_loader, model_class, device, num_trials=15, num_epochs=5):
    """Randomly sample hyperparameters and return the best-scoring trial.

    Each trial draws one value per hyperparameter from a fixed discrete grid,
    trains a fresh ``model_class`` instance with SGD and cosine-annealed
    learning rate via ``train_model_with_early_stopping``, and records the
    resulting learning curves.

    Args:
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        model_class: Callable accepting ``dropout1`` and ``dropout2`` keyword
            arguments and returning a module.
        device: Device the model is moved to and trained on.
        num_trials: Number of random configurations to evaluate (>= 1).
        num_epochs: Epoch budget per trial; also used as the cosine schedule
            period so the learning rate anneals exactly once per trial.

    Returns:
        The result dict of the trial with the highest ``'val_acc'`` (the best
        validation accuracy reached during that trial). The dict also holds
        the sampled hyperparameters and the per-epoch curves.

    Raises:
        ValueError: If ``num_trials`` is smaller than 1.
    """
    if num_trials < 1:
        raise ValueError("num_trials must be at least 1")

    param_space = {
        'lr': [1e-4, 1e-3, 1e-2, 1e-1],
        'weight_decay': [1e-5, 1e-4, 1e-3, 1e-2, 5e-4, 5e-3],
        'momentum': [0.9, 0.95],
        'dropout1': [0.1, 0.2, 0.3, 0.4, 0.5],
        'dropout2': [0.1, 0.2, 0.3, 0.4, 0.5],
        'patience': [3, 5, 10]
    }

    results = []

    for trial in range(num_trials):
        print(f"Trial {trial + 1}/{num_trials}")
        lr = random.choice(param_space['lr'])
        weight_decay = random.choice(param_space['weight_decay'])
        momentum = random.choice(param_space['momentum'])
        dropout1 = random.choice(param_space['dropout1'])
        dropout2 = random.choice(param_space['dropout2'])
        patience = random.choice(param_space['patience'])

        model = model_class(dropout1=dropout1, dropout2=dropout2).to(device)
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
        scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)
        criterion = nn.CrossEntropyLoss()

        train_losses, val_losses, train_accuracies, val_accuracies = train_model_with_early_stopping(
            model, optimizer, criterion, scheduler, train_loader, val_loader,
            num_epochs=num_epochs, device=device, patience=patience
        )

        results.append({
            'lr': lr,
            'weight_decay': weight_decay,
            'momentum': momentum,
            'dropout1': dropout1,
            'dropout2': dropout2,
            'val_losses': val_losses,
            'train_losses': train_losses,
            'train_accuracies': train_accuracies,
            'val_accuracies': val_accuracies,
            # Scalar ranking score. The ranking below reads this key; without
            # it the sort raised KeyError once every trial had finished.
            'val_acc': max(val_accuracies, default=0.0),
            'patience': patience
        })

    results.sort(key=lambda x: x['val_acc'], reverse=True)
    print("\nBest Hyperparameters:")
    print(results[0])

    return results[0]  # Return the best result

def train_model_with_early_stopping(model, optimizer, criterion, scheduler, train_loader, val_loader, num_epochs, device, patience=5):
    """Train ``model`` for up to ``num_epochs`` epochs, stopping early on a validation plateau.

    Every epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader``, then advances ``scheduler`` once.
    Training stops as soon as the validation loss has failed to improve for
    ``patience`` consecutive epochs.

    Returns:
        Four lists with one entry per completed epoch: mean per-batch training
        loss, mean per-batch validation loss, training accuracy (%) and
        validation accuracy (%).
    """
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    lowest_val_loss = float('inf')
    stale_epochs = 0

    for epoch in range(num_epochs):
        # --- optimisation pass ---
        model.train()
        running_loss = 0
        hits = 0
        seen = 0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            optimizer.zero_grad()
            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()
            predicted = logits.max(1)[1]
            seen += batch_labels.size(0)
            hits += predicted.eq(batch_labels).sum().item()

        train_losses.append(running_loss / len(train_loader))
        train_accuracies.append(100. * hits / seen)

        # --- evaluation pass (no gradients, dropout disabled) ---
        model.eval()
        val_running_loss = 0
        hits = 0
        seen = 0
        with torch.no_grad():
            for batch_inputs, batch_labels in val_loader:
                batch_inputs = batch_inputs.to(device)
                batch_labels = batch_labels.to(device)
                logits = model(batch_inputs)

                val_running_loss += criterion(logits, batch_labels).item()
                predicted = logits.max(1)[1]
                seen += batch_labels.size(0)
                hits += predicted.eq(batch_labels).sum().item()

        val_losses.append(val_running_loss / len(val_loader))
        val_accuracies.append(100. * hits / seen)

        scheduler.step()

        # Plateau tracking uses the summed batch losses of this epoch.
        if val_running_loss < lowest_val_loss:
            lowest_val_loss = val_running_loss
            stale_epochs = 0
        else:
            stale_epochs += 1

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracies[-1]:.2f}%, "
              f"Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.2f}%")

        if stale_epochs >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

    return train_losses, val_losses, train_accuracies, val_accuracies


# Main function
if __name__ == "__main__":
    # Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Evaluate ten random configurations of LeNet5 and keep the best one.
    best_hyperparams = random_search(train_loader, val_loader, LeNet5, device, num_trials=10)

    print("\nUsing Best Hyperparameters:")
    print(best_hyperparams)


Files already downloaded and verified
Files already downloaded and verified
Trial 1/10
Epoch 1/5, Train Loss: 4.6081, Train Acc: 1.05%, Val Loss: 4.6075, Val Acc: 0.99%
Epoch 2/5, Train Loss: 4.6070, Train Acc: 0.96%, Val Loss: 4.6070, Val Acc: 0.99%
Epoch 3/5, Train Loss: 4.6065, Train Acc: 1.01%, Val Loss: 4.6065, Val Acc: 1.01%
Epoch 4/5, Train Loss: 4.6062, Train Acc: 1.03%, Val Loss: 4.6065, Val Acc: 0.96%
Epoch 5/5, Train Loss: 4.6061, Train Acc: 0.97%, Val Loss: 4.6064, Val Acc: 0.95%
Trial 2/10
Epoch 1/5, Train Loss: 4.5960, Train Acc: 1.40%, Val Loss: 4.5514, Val Acc: 1.86%
Epoch 2/5, Train Loss: 4.5036, Train Acc: 2.56%, Val Loss: 4.4375, Val Acc: 3.61%
Epoch 3/5, Train Loss: 4.3683, Train Acc: 3.55%, Val Loss: 4.2628, Val Acc: 3.93%


KeyboardInterrupt: 

In [None]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
import torchvision
import torchvision.transforms as transforms
import numpy as np
from PIL import Image
import math

# Data preprocessing
class Cutout(object):
    """Zero out a single random square region of an image.

    A centre pixel is drawn uniformly at random; the erased region extends
    ``size // 2`` pixels from it in every direction and is clipped to the
    image bounds.
    """

    def __init__(self, size):
        self.size = size  # nominal side length of the erased square, in pixels

    def __call__(self, img):
        """Apply the erasure to ``img`` and return the result as a PIL image."""
        if isinstance(img, Image.Image):
            img = np.array(img)

        rows, cols = img.shape[:2]
        radius = self.size // 2

        # Row is sampled before column.
        cy = np.random.randint(rows)
        cx = np.random.randint(cols)

        row_lo, row_hi = (np.clip(cy + offset, 0, rows) for offset in (-radius, radius))
        col_lo, col_hi = (np.clip(cx + offset, 0, cols) for offset in (-radius, radius))

        # Multiplicative mask: 0 inside the square, 1 elsewhere.
        mask = np.ones((rows, cols), np.float32)
        mask[row_lo:row_hi, col_lo:col_hi] = 0

        masked = img * mask[:, :, np.newaxis]
        return Image.fromarray(np.uint8(masked))

# Data transformations
# Steps shared by both pipelines: tensor conversion, then per-channel
# normalization with mean 0.5 and std 0.5.
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]

# Training pipeline: random augmentations and Cutout, followed by the shared steps.
transform_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomRotation(15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
        Cutout(size=8),
    ]
    + _tensor_steps
)

# Evaluation pipeline: the shared steps only, no augmentation.
transform_test = transforms.Compose(list(_tensor_steps))

def load_cifar100():
    """Load CIFAR-100 and build the train / validation / test data loaders.

    The official training split is divided 80% / 20% into training and
    validation subsets using one random permutation. Training samples go
    through ``transform_train`` (augmentation); validation and test samples go
    through ``transform_test`` so that evaluation is not randomly augmented.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``, each with batch
        size 64; only the training loader shuffles.
    """
    data_root = './data'

    train_source = torchvision.datasets.CIFAR100(
        root=data_root,
        train=True,
        download=True,
        transform=transform_train
    )
    # Same images as `train_source`, but with the evaluation transform.
    # Splitting `train_source` directly would make the validation subset
    # inherit the random training augmentations.
    val_source = torchvision.datasets.CIFAR100(
        root=data_root,
        train=True,
        download=False,
        transform=transform_test
    )
    test_dataset = torchvision.datasets.CIFAR100(
        root=data_root,
        train=False,
        download=True,
        transform=transform_test
    )

    # Split training and validation sets over one shared permutation so the
    # two subsets are disjoint.
    num_samples = len(train_source)
    train_size = int(0.8 * num_samples)
    shuffled = torch.randperm(num_samples).tolist()
    train_dataset = torch.utils.data.Subset(train_source, shuffled[:train_size])
    val_dataset = torch.utils.data.Subset(val_source, shuffled[train_size:])

    # Create data loaders
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

    return train_loader, val_loader, test_loader

# Define LeNet-5 model
class LeNet5(nn.Module):
    def __init__(self, dropout1=0.1, dropout2=0.1):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.dropout1 = nn.Dropout(dropout1)
        self.fc2 = nn.Linear(120, 84)
        self.dropout2 = nn.Dropout(dropout2)
        self.fc3 = nn.Linear(84, 100)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

def check_initial_loss(model, train_loader, device, tolerance=1.0):
    """Step 1: Check that the loss at initialization is close to chance level.

    For a classifier that has not learned anything yet, cross-entropy should
    be about ``log(num_classes)``. The class count is read from the width of
    the model output instead of being hard-coded, so the check also works for
    models with a different number of classes.

    Args:
        model: Classifier to probe; it is switched to eval mode.
        train_loader: Iterable of ``(inputs, labels)`` batches; only the first
            batch is used.
        device: Device the batch is moved to.
        tolerance: Maximum allowed absolute gap between measured and expected
            loss.

    Returns:
        ``True`` if the measured loss is within ``tolerance`` of the expected
        loss, otherwise ``False``.
    """
    model.eval()
    with torch.no_grad():
        inputs, labels = next(iter(train_loader))
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        loss = F.cross_entropy(outputs, labels).item()

    # assumes logits are shaped (batch, num_classes)
    num_classes = outputs.size(1)
    expected_loss = math.log(num_classes)
    print(f"Initial loss: {loss:.4f}")
    print(f"Expected loss: {expected_loss:.4f}")
    return abs(loss - expected_loss) < tolerance

def overfit_small_sample(model, train_loader, device, max_iterations=1000):
    """Step 2: Try to memorise a handful of training batches.

    The samples of the first five minibatches of ``train_loader`` are frozen
    into a small dataset, and ``model`` is trained on it with plain SGD
    (learning rate 0.01) for at most ``max_iterations`` passes.

    Returns:
        ``True`` as soon as a pass reaches more than 99% accuracy, ``False``
        if that never happens within ``max_iterations`` passes.
    """
    # Collect the individual (input, label) pairs of the first five minibatches.
    samples = []
    for batch_idx, (batch_inputs, batch_labels) in enumerate(train_loader):
        if batch_idx >= 5:
            break
        samples += zip(batch_inputs, batch_labels)

    small_loader = torch.utils.data.DataLoader(samples, batch_size=64, shuffle=True)

    optimizer = optim.SGD(model.parameters(), lr=0.01)
    model.train()

    for iteration in range(max_iterations):
        loss_sum, num_correct, num_seen = 0, 0, 0

        for inputs, labels in small_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            predicted = outputs.max(1)[1]
            num_seen += labels.size(0)
            num_correct += predicted.eq(labels).sum().item()

        accuracy = 100. * num_correct / num_seen
        avg_loss = loss_sum / len(small_loader)

        # Progress report every 50 passes.
        if iteration % 50 == 0:
            print(f"Iteration {iteration}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

        if accuracy > 99:
            print("Successfully overfit small sample!")
            return True

    return False

def find_initial_lr(model, train_loader, device, lrs=(1e-1, 1e-2, 1e-3, 1e-4), num_iterations=100):
    """Step 3: Find the learning rate that makes the loss go down the most.

    For every candidate the model is re-initialised and trained with SGD
    (weight decay 1e-5) for ``num_iterations`` minibatch steps. The candidate
    with the largest positive drop between the first and the last measured
    loss wins.

    Args:
        model: Model to probe; its parameters are reset for every candidate
            and it is left in training mode.
        train_loader: Iterable of ``(inputs, labels)`` batches. It is cycled
            if it has fewer than ``num_iterations`` batches.
        device: Device the batches are moved to.
        lrs: Candidate learning rates.
        num_iterations: Number of optimisation steps per candidate (>= 1).

    Returns:
        The best learning rate, or ``None`` if no candidate reduced the loss
        (including candidates whose loss diverged to NaN).

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    best_lr = None
    best_decrease = 0.0

    # Optimisation steps are taken below, so dropout must be active.
    model.train()

    for lr in lrs:
        model.apply(lambda m: m.reset_parameters() if hasattr(m, 'reset_parameters') else None)
        optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-5)
        initial_loss = None
        final_loss = None

        print(f"\nTrying learning rate: {lr}")

        # One iterator per candidate instead of a fresh one for every step.
        batches = iter(train_loader)

        for iteration in range(num_iterations):
            try:
                inputs, labels = next(batches)
            except StopIteration:
                # Loader exhausted: start another pass over it.
                batches = iter(train_loader)
                inputs, labels = next(batches)

            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, labels)

            if iteration == 0:
                initial_loss = loss.item()
            final_loss = loss.item()

            loss.backward()
            optimizer.step()

            if iteration % 20 == 0:
                print(f"Iteration {iteration}, Loss: {loss.item():.4f}")

        loss_decrease = initial_loss - final_loss
        print(f"Loss decrease: {loss_decrease:.4f}")

        # Keep the candidate with the LARGEST drop. A NaN decrease compares
        # False here, so diverged runs are never selected.
        if loss_decrease > best_decrease:
            best_decrease = loss_decrease
            best_lr = lr

    return best_lr

def coarse_grid_search(model, train_loader, val_loader, device, best_lr):
    """Step 4: Coarse grid search over learning rate and weight decay.

    Tries every combination of three learning rates (half, equal to and
    double ``best_lr``) and three weight decays. For each combination the
    model is re-initialised and trained with SGD for five epochs.

    Returns:
        Dict with keys ``'lr'``, ``'weight_decay'``, ``'final_loss'`` and
        ``'final_val_acc'`` for the combination whose last-epoch validation
        accuracy is highest (the earliest one on ties).
    """
    def _reset(module):
        # Re-initialise every layer that knows how to reset itself.
        if hasattr(module, 'reset_parameters'):
            module.reset_parameters()

    grid = [
        (lr, wd)
        for lr in (best_lr / 2, best_lr, best_lr * 2)
        for wd in (0, 1e-5, 1e-4)
    ]
    results = []

    for lr, wd in grid:
        print(f"\nTrying LR: {lr}, Weight Decay: {wd}")
        model.apply(_reset)
        optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=wd)

        train_losses = []
        val_accuracies = []

        for epoch in range(5):
            # Training pass
            model.train()
            total_loss = 0
            for inputs, labels in train_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                loss = F.cross_entropy(model(inputs), labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # Validation pass (accuracy only)
            model.eval()
            correct = total = 0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    predicted = model(inputs).max(1)[1]
                    total += labels.size(0)
                    correct += predicted.eq(labels).sum().item()

            train_losses.append(total_loss / len(train_loader))
            val_accuracies.append(100. * correct / total)

            print(f"Epoch {epoch + 1}, Loss: {train_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.2f}%")

        results.append({
            'lr': lr,
            'weight_decay': wd,
            'final_loss': train_losses[-1],
            'final_val_acc': val_accuracies[-1]
        })

    # max() returns the first entry holding the maximum.
    return max(results, key=lambda entry: entry['final_val_acc'])

def train_model_with_early_stopping(model, optimizer, criterion, scheduler, train_loader, val_loader, num_epochs, device, patience=5):
    """Train with per-epoch validation and stop once validation loss plateaus.

    Runs at most ``num_epochs`` epochs. After each epoch the scheduler is
    stepped and the validation loss is compared with the best one seen so
    far; ``patience`` consecutive epochs without improvement end training.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``, one
        entry per completed epoch; losses are per-batch means, accuracies are
        percentages.
    """
    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    best_so_far = float('inf')
    bad_epochs = 0

    for epoch in range(num_epochs):
        model.train()
        loss_sum, n_correct, n_seen = _run_pass(model, criterion, train_loader, device, optimizer)
        train_losses.append(loss_sum / len(train_loader))
        train_accuracies.append(100. * n_correct / n_seen)

        # Validation phase: no gradients, no parameter updates.
        model.eval()
        with torch.no_grad():
            val_loss_sum, n_correct, n_seen = _run_pass(model, criterion, val_loader, device)
        val_losses.append(val_loss_sum / len(val_loader))
        val_accuracies.append(100. * n_correct / n_seen)

        scheduler.step()

        # The plateau test works on the summed batch losses of the epoch.
        improved = val_loss_sum < best_so_far
        if improved:
            best_so_far = val_loss_sum
            bad_epochs = 0
        else:
            bad_epochs += 1

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, "
              f"Train Acc: {train_accuracies[-1]:.2f}%, "
              f"Val Loss: {val_losses[-1]:.4f}, "
              f"Val Acc: {val_accuracies[-1]:.2f}%")

        if bad_epochs >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

    return train_losses, val_losses, train_accuracies, val_accuracies


def _run_pass(model, criterion, loader, device, optimizer=None):
    """Run one pass over ``loader``; update parameters only when ``optimizer`` is given.

    Returns the summed batch loss, the number of correct predictions and the
    number of samples seen.
    """
    loss_sum, n_correct, n_seen = 0, 0, 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        if optimizer is not None:
            loss.backward()
            optimizer.step()

        loss_sum += loss.item()
        predicted = outputs.max(1)[1]
        n_seen += labels.size(0)
        n_correct += predicted.eq(labels).sum().item()
    return loss_sum, n_correct, n_seen

def refined_training(model, train_loader, val_loader, device, best_params, num_epochs=20):
    """Step 5 & 6: Retrain from scratch with the selected hyperparameters.

    Re-initialises ``model``, then trains it with SGD using
    ``best_params['lr']`` and ``best_params['weight_decay']``, a cosine
    schedule spanning ``num_epochs`` and cross-entropy loss, with early
    stopping at its default patience.

    Returns:
        Whatever ``train_model_with_early_stopping`` returns (the per-epoch
        loss and accuracy curves).
    """
    def _reinitialize(module):
        # Only layers exposing reset_parameters() can be re-initialised.
        if hasattr(module, 'reset_parameters'):
            module.reset_parameters()

    model.apply(_reinitialize)

    sgd = optim.SGD(
        model.parameters(),
        lr=best_params['lr'],
        weight_decay=best_params['weight_decay'],
    )
    cosine = CosineAnnealingLR(sgd, T_max=num_epochs)
    loss_fn = nn.CrossEntropyLoss()

    curves = train_model_with_early_stopping(
        model, sgd, loss_fn, cosine,
        train_loader, val_loader, num_epochs, device
    )
    return curves

def main():
    """Run the step-by-step tuning recipe for LeNet-5 on CIFAR-100.

    Steps: sanity-check the initial loss, overfit a small sample, pick a
    starting learning rate, run a coarse grid search over learning rate and
    weight decay, retrain with the best combination, and plot the resulting
    learning curves.

    Raises:
        RuntimeError: If no candidate learning rate reduced the loss, since
            the grid search cannot be built without one.
    """
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load CIFAR-100 data
    train_loader, val_loader, test_loader = load_cifar100()

    # Initialize model
    model = LeNet5().to(device)

    # Step 1: Check initial loss
    if check_initial_loss(model, train_loader, device):
        print("Initial loss check passed.")

    # Step 2: Overfit small sample
    if overfit_small_sample(model, train_loader, device):
        print("Successfully overfit small sample.")

    # Step 3: Find learning rate
    best_lr = find_initial_lr(model, train_loader, device)
    print(f"Best learning rate: {best_lr}")
    if best_lr is None:
        # Without this guard the grid search fails on `None / 2` with an
        # unhelpful TypeError.
        raise RuntimeError("No candidate learning rate reduced the loss; cannot build the search grid.")

    # Step 4: Coarse grid search for LR and weight decay
    best_model_params = coarse_grid_search(model, train_loader, val_loader, device, best_lr)
    print(f"Best model parameters: {best_model_params}")

    # Step 5: Train for longer with the best parameters found.
    # refined_training re-initialises the model and returns the four learning
    # curves; its cosine schedule spans the whole epoch budget, so the
    # learning rate is annealed exactly once.
    train_losses, val_losses, train_accuracies, val_accuracies = refined_training(
        model, train_loader, val_loader, device, best_model_params, num_epochs=20
    )

    # Step 6: Look at learning curves
    # Plot training and validation losses and accuracies
    import matplotlib.pyplot as plt
    # Early stopping may end training before the budget, so size the x-axis
    # from the recorded history.
    epochs = range(1, len(train_losses) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Validation Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label="Train Accuracy")
    plt.plot(epochs, val_accuracies, label="Validation Accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()

    plt.tight_layout()
    plt.show()

# Run the tuning pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()

Files already downloaded and verified
Files already downloaded and verified
Initial loss: 4.6130
Expected loss: 4.6052
Initial loss check passed.
Iteration 0, Loss: 4.6086, Accuracy: 0.62%
Iteration 50, Loss: 4.5957, Accuracy: 3.12%
Iteration 100, Loss: 4.5792, Accuracy: 2.50%
Iteration 150, Loss: 4.5498, Accuracy: 2.81%
Iteration 200, Loss: 4.4364, Accuracy: 4.06%
Iteration 250, Loss: 4.2377, Accuracy: 5.31%
Iteration 300, Loss: 3.6982, Accuracy: 13.12%
Iteration 350, Loss: 2.8516, Accuracy: 26.56%
Iteration 400, Loss: 1.6203, Accuracy: 53.75%
Iteration 450, Loss: 0.8472, Accuracy: 74.38%
Iteration 500, Loss: 0.3560, Accuracy: 89.38%
Iteration 550, Loss: 0.2176, Accuracy: 95.62%
Iteration 600, Loss: 0.1298, Accuracy: 97.50%
Iteration 650, Loss: 0.0843, Accuracy: 98.12%
Successfully overfit small sample!
Successfully overfit small sample.

Trying learning rate: 0.1
Iteration 0, Loss: 4.6203
Iteration 20, Loss: 4.6258
Iteration 40, Loss: 4.6053
Iteration 60, Loss: 4.6274
Iteration 80, L

# Centralized baseline

In [None]:
# define LeNet-5 model
class LeNet5(nn.Module):
    """LeNet-5 style CNN producing 100-way logits from 3-channel images.

    ``fc1`` expects 16 * 5 * 5 features, which is what two 5x5 convolutions
    each followed by 2x2 max-pooling yield for 32x32 inputs.

    Args:
        dropout1: Drop probability after the first dense layer. Defaults to
            0.1, the value that used to be hard-coded.
        dropout2: Drop probability after the second dense layer. Defaults to
            0.1, the value that used to be hard-coded.
    """

    def __init__(self, dropout1=0.1, dropout2=0.1):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.dropout1 = nn.Dropout(dropout1)
        self.fc2 = nn.Linear(120, 84)
        self.dropout2 = nn.Dropout(dropout2)
        self.fc3 = nn.Linear(84, 100)

    def forward(self, x):
        """Map a batch of images to unnormalised class scores."""
        # Two rounds of convolution -> ReLU -> 2x2 max-pool.
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        # Flatten everything except the batch dimension.
        x = torch.flatten(x, 1)
        # Dense head with dropout between the layers.
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x


# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Training function
def train_model(model, optimizer, criterion, scheduler, train_loader, val_loader, num_epochs):
    """Train ``model`` for exactly ``num_epochs`` epochs with per-epoch validation.

    Batches are moved to the module-level ``device``. After every epoch the
    scheduler is stepped once and a summary line is printed.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``, one
        entry per epoch; losses are per-batch means, accuracies are
        percentages.
    """
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        # --- optimisation pass ---
        model.train()
        epoch_loss = 0
        n_correct = 0
        n_seen = 0
        for images, targets in train_loader:
            images = images.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            scores = model(images)
            step_loss = criterion(scores, targets)
            step_loss.backward()
            optimizer.step()

            epoch_loss += step_loss.item()
            guesses = scores.max(1)[1]
            n_seen += targets.size(0)
            n_correct += guesses.eq(targets).sum().item()

        train_losses.append(epoch_loss / len(train_loader))
        train_accuracies.append(100. * n_correct / n_seen)

        # --- validation pass (no gradients) ---
        model.eval()
        epoch_loss = 0
        n_correct = 0
        n_seen = 0
        with torch.no_grad():
            for images, targets in val_loader:
                images = images.to(device)
                targets = targets.to(device)
                scores = model(images)

                epoch_loss += criterion(scores, targets).item()
                guesses = scores.max(1)[1]
                n_seen += targets.size(0)
                n_correct += guesses.eq(targets).sum().item()

        val_losses.append(epoch_loss / len(val_loader))
        val_accuracies.append(100. * n_correct / n_seen)

        scheduler.step()

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_losses[-1]:.4f}, Train Acc: {train_accuracies[-1]:.2f}%, Val Loss: {val_losses[-1]:.4f}, Val Acc: {val_accuracies[-1]:.2f}%")

    return train_losses, val_losses, train_accuracies, val_accuracies

# Test function
def test_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print its top-1 accuracy.

    Batches are moved to the module-level ``device``; the model is put in
    eval mode and no gradients are tracked. Nothing is returned.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, targets in test_loader:
            images = images.to(device)
            targets = targets.to(device)
            guesses = model(images).max(1)[1]
            total += targets.size(0)
            correct += guesses.eq(targets).sum().item()

    print(f"Test Accuracy: {100. * correct / total:.2f}%")

# Training configurations
# Epoch budget for the baseline run; also used as the cosine schedule period below.
num_epochs = 150
criterion = nn.CrossEntropyLoss()

# Train with AdamW
# Fresh model on the selected device, AdamW with decoupled weight decay, and a
# cosine schedule with T_max equal to the number of epochs.
model = LeNet5().to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.05)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
print("Training with AdamW optimizer...")
# Keep the per-epoch curves so they can be plotted afterwards.
train_losses_adamw, val_losses_adamw, train_acc_adamw, val_acc_adamw = train_model(model, optimizer, criterion, scheduler, train_loader, val_loader, num_epochs)
# Report accuracy of the trained model on the held-out test set.
test_model(model, test_loader)

# Plot training and validation results
def plot_results(train_losses, val_losses, train_accuracies, val_accuracies, title):
    """Plot loss and accuracy curves side by side.

    The x-axis is sized from the length of ``train_losses`` rather than from
    the global ``num_epochs``, so the function also works for histories that
    are shorter than the configured budget (e.g. after early stopping) and
    does not depend on a module-level variable.

    Args:
        train_losses: Per-epoch training losses.
        val_losses: Per-epoch validation losses.
        train_accuracies: Per-epoch training accuracies (%).
        val_accuracies: Per-epoch validation accuracies (%).
        title: Prefix used for both subplot titles.
    """
    epochs = range(1, len(train_losses) + 1)
    plt.figure(figsize=(12, 5))

    # Plot losses
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label='Train Loss')
    plt.plot(epochs, val_losses, label='Val Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title(f'{title} Loss')
    plt.legend()

    # Plot accuracies
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_accuracies, label='Train Accuracy')
    plt.plot(epochs, val_accuracies, label='Val Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy (%)')
    plt.title(f'{title} Accuracy')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Plot the loss and accuracy curves recorded during the AdamW baseline run
plot_results(train_losses_adamw, val_losses_adamw, train_acc_adamw, val_acc_adamw, 'AdamW')