### Import the required packages

In [4]:
import torch
import random
import torchvision
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, random_split

### Load the data and create the Train, Validation and Test datasets

In [5]:
# Normalisation constants used below: global mean and std of the MNIST
# training pixel values after scaling to [0, 1]. They were obtained once with:
#   mean = full_train_dataset.data.float().mean() / 255  # = 0.1307
#   std = full_train_dataset.data.float().std() / 255    # = 0.3081

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# download=True fetches the files only when they are missing under `root`,
# so the notebook also runs on a machine that has no ./data directory yet.
full_train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True,
)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transform,
    download=True,
)

# 80/20 random split of the official training set into train and validation
# subsets (the split is not seeded, so it changes between runs).
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

### Check the data

In [None]:
# `train_dataset` / `val_dataset` are Subsets: the raw tensors live on
# `.dataset` and `.indices` selects the rows belonging to the split.
# Index each split once instead of re-slicing the full tensor on every use.
train_images = train_dataset.dataset.data[train_dataset.indices]
train_labels = train_dataset.dataset.targets[train_dataset.indices]

print(f'train_dataset data shape: {train_images.shape}')
print(f'train_dataset targets shape: {train_labels.shape}')

print(f'val_dataset data shape: {val_dataset.dataset.data[val_dataset.indices].shape}')
print(f'val_dataset targets shape: {val_dataset.dataset.targets[val_dataset.indices].shape}')

print(f'test_dataset data shape: {test_dataset.data.shape}')
print(f'test_dataset targets shape: {test_dataset.targets.shape}')

print(f'Classes: {train_dataset.dataset.classes}')

# Show the first six training samples (raw, un-normalised pixels) in a 2x3 grid.
for i in range(6):
    plt.subplot(2, 3, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(train_images[i], cmap='gray')
    plt.xlabel(f'Label: {train_labels[i]}')

plt.show()

### Check the data distribution

In [None]:
# Labels of each split as NumPy arrays (Subsets are resolved through `.indices`).
y_train = train_dataset.dataset.targets[train_dataset.indices].numpy()
y_val = val_dataset.dataset.targets[val_dataset.indices].numpy()
y_test = test_dataset.targets.numpy()

datasets = {
    'Train': y_train,
    'Val': y_val,
    'Test': y_test
}

# Long-format records: one row per (split, digit) with its occurrence count.
data = []
for set_name, y_data in datasets.items():
    unique, counts = np.unique(y_data, return_counts=True)
    data.extend(
        {'Dataset': set_name, 'Digit': digit, 'Count': count}
        for digit, count in zip(unique, counts)
    )

df = pd.DataFrame(data)
df_pivot = df.pivot(index='Digit', columns='Dataset', values='Count')

# Draw on an explicitly created axes. Calling plt.figure() and then
# DataFrame.plot() without `ax` makes pandas open a second figure, leaving the
# first one empty and the requested figsize unused.
fig, ax = plt.subplots(figsize=(15, 8))
df_pivot.plot(kind='bar', width=0.8, ax=ax)

ax.set_title('Digit distribution in MNIST datasets')
ax.set_xlabel('Digit')
ax.set_ylabel('Count')
ax.legend(title='Dataset', loc='upper right')
ax.grid(True, alpha=0.3)

plt.show()

### Model class and helper functions

In [8]:
class Net(nn.Module):
    """Fully connected classifier: Linear+ReLU per hidden layer, then a Linear head.

    Args:
        input_size: number of features of each input row.
        n_neurons: sequence with the width of every hidden layer, in order.
            An empty sequence yields a single Linear layer.
        num_classes: number of output units (raw scores, no softmax applied).
    """

    def __init__(self, input_size, n_neurons, num_classes):
        super().__init__()

        # Consecutive pairs of `widths` are the (in, out) sizes of the hidden layers.
        widths = [input_size, *n_neurons]

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())

        # Output layer maps the last hidden width (or the input) to the classes.
        stack.append(nn.Linear(widths[-1], num_classes))

        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Apply every stored module in order and return the final activations."""
        out = x
        for module in self.layers:
            out = module(out)
        return out
    
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, patience, verbose):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the model is evaluated with ``validate_model``. The
    weights of the epoch with the lowest validation loss are snapshotted and
    loaded back into ``model`` before returning.

    Args:
        model: network to optimise in place.
        train_loader: iterable of ``(images, labels)`` training batches.
        val_loader: iterable of ``(images, labels)`` validation batches.
        criterion: loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch.
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the batches are moved to.
        num_epochs: maximum number of epochs.
        patience: number of consecutive epochs without validation-loss
            improvement after which training stops.
        verbose: if true, log the loss of every step; otherwise only every
            100th step is logged.

    Returns:
        ``(train_losses, val_losses)``: one average loss per completed epoch.
    """
    import copy  # local import: only needed to snapshot the best weights

    train_losses = []
    val_losses = []
    patience_counter = 0
    best_model_state = None
    best_val_loss = float('inf')
    n_total_steps = len(train_loader)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        n_seen = 0

        for i, (images, labels) in enumerate(train_loader):
            # Flatten every sample to a vector, whatever its original shape.
            images = images.reshape(images.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so a smaller last batch does not skew the
            # epoch average (same convention as validate_model).
            batch_loss = loss.item()
            running_loss += batch_loss * labels.size(0)
            n_seen += labels.size(0)

            if verbose or (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {batch_loss}')

        train_loss = running_loss / n_seen
        train_losses.append(train_loss)

        val_loss, _ = validate_model(model, val_loader, criterion, device)
        val_losses.append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # Deep copy: state_dict() hands out references to the live
            # parameter tensors, so a shallow dict copy would keep changing
            # as training continues and the "best" weights would be lost.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break

    # Roll back to the best epoch seen (no-op if no epoch was run).
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return train_losses, val_losses
    
def validate_model(model, test_loader, criterion, device):
    """Evaluate ``model`` on every batch of ``test_loader`` without gradients.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: network to evaluate.
        test_loader: iterable of ``(images, labels)`` batches.
        criterion: loss callable ``criterion(outputs, labels)``; assumed to
            return the mean loss over the batch.
        device: device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the sample-weighted average loss and the
        percentage (0-100) of samples whose highest-scoring class equals the
        label.

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    total_loss = 0.0
    n_samples = 0
    n_correct = 0

    model.eval()
    with torch.no_grad():
        for images, labels in test_loader:
            # Flatten every sample to a vector, whatever its original shape,
            # so the function works for any input size the model accepts.
            images = images.reshape(images.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # Undo the per-batch mean so batches of different size are
            # weighted by the number of samples they contain.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size

            predicted = outputs.argmax(dim=1)
            n_correct += (predicted == labels).sum().item()

    avg_loss = total_loss / n_samples
    accuracy = 100.0 * n_correct / n_samples

    return avg_loss, accuracy

def plot_losses(train_losses, val_losses, title):
    """Plot training and validation loss per epoch on a new figure and show it.

    Args:
        train_losses: sequence with one training loss per epoch.
        val_losses: sequence with one validation loss per epoch; plotted
            against the same epoch axis as ``train_losses``.
        title: figure title.
    """
    # Epochs are numbered from 1 on the x axis.
    n_epochs = len(train_losses)
    epochs = range(1, n_epochs + 1)

    # (values, line format, legend label) for each curve, in drawing order.
    curves = (
        (train_losses, 'b-', 'Training Loss'),
        (val_losses, 'r-', 'Validation Loss'),
    )

    plt.figure(figsize=(10, 6))
    for losses, line_format, legend_label in curves:
        plt.plot(epochs, losses, line_format, label=legend_label)

    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

### Model definition, training and testing (single train/validation split, no k-fold, for fast evaluation)

In [None]:
# Train one network on the fixed train/validation split and report its test
# accuracy. Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters for the neural network
# input_size matches the 28*28 flattening done in train_model/validate_model.
input_size = 784
# Width of each hidden layer, in order.
n_neurons = (512, 256, 128)
num_classes = 10

# Hyperparameters for the training
# num_epochs is an upper bound: early stopping may end training sooner.
num_epochs = 300
# One batch per epoch: every optimizer step sees the whole training split
# (presumably intentional, since Rprop's sign-based updates are meant for
# full-batch gradients -- confirm before switching to mini-batches).
batch_size = len(train_dataset)
# Passed to Rprop as `lr`.
learning_rate = 0.001

# Hyperparameters for the Rprop optimizer
# (eta_minus, eta_plus) are passed as `etas`, (min_step, max_step) as `step_sizes`.
eta_minus = 0.5
eta_plus = 1.2
min_step = 1e-6
max_step = 50

# Hyperparameters for the early stopping
# Consecutive epochs without validation-loss improvement before stopping.
patience = 3

# The validation and test loaders reuse the training batch size.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = Net(input_size, n_neurons, num_classes).to(device)
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Rprop(
    model.parameters(), 
    lr=learning_rate, 
    etas=(eta_minus, eta_plus), 
    step_sizes=(min_step, max_step),
)

# verbose=True logs the loss of every optimizer step.
train_losses, val_losses = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, patience, verbose=True)
plot_losses(train_losses, val_losses, title='Training and Validation Loss')

# Final check on the held-out test set; only the accuracy is reported.
_, acc = validate_model(model, test_loader, criterion, device)
print(f'Test accuracy: {acc:.2f}%')

### K-fold cross-validation with random search

In [None]:
# Random search over network/Rprop hyperparameters, scored by stratified
# k-fold cross-validation on the full training set, followed by a final
# training run with the best configuration and an evaluation on the test set.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters for the neural network
input_size = 784
num_classes = 10

# Hyperparameters for the training
num_epochs = 300
# Full batch for the final run on the train/validation split; inside the
# cross-validation loop the batch size follows the size of each fold instead.
batch_size = len(train_dataset)
learning_rate = 0.001

# Hyperparameters for the early stopping
patience = 3

# Hyperparameters for the k-fold cross-validation
k_folds = 2

# Number of random trials
n_trials = 5

# Parameter space for random search
param_space = {
    'n_neurons': [(512, 256, 128), (256, 128, 64), (128, 64, 32), (512, 256), (256, 128)],
    'eta_minus': [0.3, 0.4, 0.5, 0.6],
    'eta_plus': [1.1, 1.2, 1.3, 1.4],
    'min_step': [1e-8, 1e-7, 1e-6, 1e-5],
    'max_step': [30, 40, 50, 60]
}

# Start below any reachable accuracy so the first trial always becomes the
# incumbent and `best_params` cannot stay None after at least one trial.
best_accuracy = float('-inf')
best_params = None
all_results = []

X = full_train_dataset.data.numpy().reshape(-1, 784)
y = full_train_dataset.targets.numpy()

# Draw the folds once and reuse them for every trial, so that differences
# between trials come from the hyperparameters and not from different splits.
kfold = StratifiedKFold(n_splits=k_folds, shuffle=True)
fold_splits = list(kfold.split(X, y))

for trial in range(n_trials):
    # `n_neurons` holds tuples of different lengths, which np.random.choice
    # cannot sample from, hence random.choice for that entry.
    params = {
        'n_neurons': random.choice(param_space['n_neurons']),
        'eta_minus': float(np.random.choice(param_space['eta_minus'])),
        'eta_plus': float(np.random.choice(param_space['eta_plus'])),
        'min_step': float(np.random.choice(param_space['min_step'])),
        'max_step': float(np.random.choice(param_space['max_step']))
    }

    print(f"\nTrial {trial+1}/{n_trials}")
    print(f"Testing parameters: {params}")

    fold_train_losses = []
    fold_val_losses = []
    fold_accuracies = []

    for fold, (train_ids, val_ids) in enumerate(fold_splits):
        print(f'FOLD {fold+1}/{k_folds}')

        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)

        # Batch size = fold size keeps training full-batch for any k_folds
        # (a fixed len(train_dataset) would split larger folds into several
        # batches).
        train_loader = DataLoader(
            full_train_dataset,
            batch_size=len(train_ids),
            sampler=train_subsampler
        )

        val_loader = DataLoader(
            full_train_dataset,
            batch_size=len(val_ids),
            sampler=val_subsampler
        )

        model = Net(input_size, params['n_neurons'], num_classes).to(device)
        criterion = nn.CrossEntropyLoss()

        optimizer = torch.optim.Rprop(
            model.parameters(),
            lr=learning_rate,
            etas=(params['eta_minus'], params['eta_plus']),
            step_sizes=(params['min_step'], params['max_step'])
        )

        train_losses, val_losses = train_model(
            model,
            train_loader,
            val_loader,
            criterion,
            optimizer,
            device,
            num_epochs,
            patience,
            verbose=True
        )

        # Accuracy of the model returned by train_model on this fold's
        # validation part; the loss is not needed here.
        _, accuracy = validate_model(model, val_loader, criterion, device)

        fold_train_losses.append(train_losses)
        fold_val_losses.append(val_losses)
        fold_accuracies.append(accuracy)

    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    # Losses are summarised by the last recorded epoch of every fold.
    mean_train_loss = np.mean([losses[-1] for losses in fold_train_losses])
    mean_val_loss = np.mean([losses[-1] for losses in fold_val_losses])

    result = {
        'trial': trial + 1,
        'params': params,
        'mean_accuracy': mean_accuracy,
        'std_accuracy': std_accuracy,
        'mean_train_loss': mean_train_loss,
        'mean_val_loss': mean_val_loss
    }
    all_results.append(result)

    print(f'Average accuracy: {mean_accuracy:.2f}% (+- {std_accuracy:.2f}%)')
    print(f'Average train loss: {mean_train_loss:.4f}')
    print(f'Average val loss: {mean_val_loss:.4f}')

    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_params = params

print('\nBEST PARAMETERS')
print('--------------')
print(f'Parameters: {best_params}')
print(f'Accuracy: {best_accuracy:.2f}%')

# Create a DataFrame with all results for better analysis
results_df = pd.DataFrame(all_results)
print('\nAll trials sorted by mean accuracy:')
print(results_df.sort_values('mean_accuracy', ascending=False))

# Train final model with best parameters.
# Build dedicated loaders over the train/validation split: reusing
# `train_loader` / `val_loader` here would silently train on the last
# cross-validation fold only.
final_train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
final_val_loader = DataLoader(val_dataset, batch_size=len(val_dataset))
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))

final_model = Net(input_size, best_params['n_neurons'], num_classes).to(device)

final_optimizer = torch.optim.Rprop(
    final_model.parameters(),
    lr=learning_rate,
    etas=(best_params['eta_minus'], best_params['eta_plus']),
    step_sizes=(best_params['min_step'], best_params['max_step'])
)

criterion = nn.CrossEntropyLoss()

final_train_losses, final_val_losses = train_model(
    final_model,
    final_train_loader,
    final_val_loader,
    criterion,
    final_optimizer,
    device,
    num_epochs,
    patience,
    verbose=True
)

plot_losses(final_train_losses, final_val_losses, title='Training and Validation Loss')

_, acc = validate_model(final_model, test_loader, criterion, device)
print(f'Test accuracy: {acc:.2f}%')