In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split

# Load the MNIST dataset.
# Normalize((0.5,), (0.5,)) computes (x - 0.5) / 0.5 on the single image channel.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)

# Split the training dataset into training and validation sets (80% / 20%).
# random_split returns lazy Subset views over train_dataset, so samples are
# loaded and transformed on demand by the DataLoader. sklearn's
# train_test_split would instead index every sample up front and keep the
# resulting (tensor, label) pairs in two Python lists.
val_size = int(0.2 * len(train_dataset))
train_size = len(train_dataset) - val_size
train_data, val_data = torch.utils.data.random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),  # fixed seed keeps the split reproducible
)

# Only the training loader is shuffled; evaluation order does not affect accuracy.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=64, shuffle=False)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define the neural network architecture
class FeedForwardNN(nn.Module):
    """Fully connected network: hidden linear layers followed by a linear output layer.

    Args:
        input_size: Number of features in each input sample.
        hidden_sizes: Width of each hidden layer, in order. May be empty, in
            which case the network is a single linear map from input to output.
        output_size: Number of output units. No activation is applied to them.
        activation: Module applied after every hidden layer. When omitted, a
            fresh ``nn.ReLU()`` is created for this instance.
    """

    def __init__(self, input_size, hidden_sizes, output_size, activation=None):
        super(FeedForwardNN, self).__init__()
        # Widths from the input through the last hidden layer; each consecutive
        # pair is the (in_features, out_features) of one hidden layer.
        layer_sizes = [input_size, *hidden_sizes]
        self.hidden_layers = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(layer_sizes, layer_sizes[1:])
        )
        # With no hidden layers, layer_sizes[-1] is input_size itself.
        self.output_layer = nn.Linear(layer_sizes[-1], output_size)
        # Build the default here rather than in the signature: a default written
        # as ``nn.ReLU()`` is evaluated once and shared by every instance.
        self.activation = nn.ReLU() if activation is None else activation

    def forward(self, x):
        """Apply each hidden layer plus activation, then return the output layer's raw scores."""
        for layer in self.hidden_layers:
            x = self.activation(layer(x))
        return self.output_layer(x)

def train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    """Train ``model`` and report loss and validation accuracy after every epoch.

    Each batch of images is flattened to ``(batch, features)`` before being
    passed to the model.

    Args:
        model: Network to optimize; updated in place.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        A list with one validation accuracy (in percent) per epoch. An entry
        is NaN when ``val_loader`` produced no samples in that epoch.
    """
    val_history = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0  # counted here so loaders without __len__ also work
        for images, labels in train_loader:
            images = images.view(images.shape[0], -1)  # Flatten the images
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # An empty training loader gives NaN instead of a ZeroDivisionError.
        avg_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

        # Validate the model
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.view(images.shape[0], -1)  # Flatten the images
                predicted = model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # Same guard for an empty validation loader.
        val_accuracy = 100 * correct / total if total else float("nan")
        print(f"Validation Accuracy: {val_accuracy:.2f}%")
        val_history.append(val_accuracy)

    return val_history

def _evaluate_accuracy(model, data_loader):
    """Return the percentage of samples in ``data_loader`` that ``model`` classifies correctly."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.view(images.shape[0], -1)  # Flatten the images
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total


# Hyperparameter tuning - Grid search for learning rate
learning_rates = [0.001, 0.01, 0.1]
# Start below any reachable accuracy so best_lr is always assigned, even if
# every candidate scores 0%.
best_accuracy = float("-inf")
best_lr = None

for lr in learning_rates:
    print(f"Testing learning rate: {lr}")

    # A fresh model per candidate, so runs do not share learned weights.
    model = FeedForwardNN(input_size=28*28, hidden_sizes=[128, 64], output_size=10, activation=nn.ReLU())

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train the model
    train_model(model, criterion, optimizer, train_loader, val_loader, num_epochs=5)

    # Evaluate the model on the validation set
    val_accuracy = _evaluate_accuracy(model, val_loader)
    print(f"Validation Accuracy: {val_accuracy:.2f}%")

    # Check if this learning rate gives better accuracy
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_lr = lr

print(f"Best learning rate: {best_lr}, Best Validation Accuracy: {best_accuracy:.2f}%")

# Now, train the model with the best learning rate on the combined training and validation data.
# train_dataset is the full MNIST training split, i.e. train_data plus val_data.
full_train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

model = FeedForwardNN(input_size=28*28, hidden_sizes=[128, 64], output_size=10, activation=nn.ReLU())
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=best_lr)

# NOTE: the validation samples are now part of the training data, so the
# per-epoch validation accuracy printed during this fit is optimistic. The
# test accuracy below is the held-out estimate.
train_model(model, criterion, optimizer, full_train_loader, val_loader, num_epochs=10)

# Evaluate the model on the test set
test_accuracy = _evaluate_accuracy(model, test_loader)
print(f"Test Accuracy: {test_accuracy:.2f}%")


Testing learning rate: 0.001
Epoch 1/5, Loss: 0.427487736483415
Validation Accuracy: 92.51%
Epoch 2/5, Loss: 0.20844467053065696
Validation Accuracy: 94.92%
Epoch 3/5, Loss: 0.1487502063587308
Validation Accuracy: 95.77%
Epoch 4/5, Loss: 0.11957685679197311
Validation Accuracy: 96.47%
Epoch 5/5, Loss: 0.10031005004917581
Validation Accuracy: 96.45%
Validation Accuracy: 96.45%
Testing learning rate: 0.01
Epoch 1/5, Loss: 0.41942750759919484
Validation Accuracy: 92.31%
Epoch 2/5, Loss: 0.26168215811252593
Validation Accuracy: 92.83%
Epoch 3/5, Loss: 0.23960625640302896
Validation Accuracy: 94.03%
Epoch 4/5, Loss: 0.2234372730900844
Validation Accuracy: 92.89%
Epoch 5/5, Loss: 0.21733630555247266
Validation Accuracy: 93.47%
Validation Accuracy: 93.47%
Testing learning rate: 0.1
Epoch 1/5, Loss: 3.007503330071767
Validation Accuracy: 10.92%
Epoch 2/5, Loss: 2.307611120859782
Validation Accuracy: 10.82%
Epoch 3/5, Loss: 2.3091203292210896
Validation Accuracy: 11.02%
Epoch 4/5, Loss: 2.30984