### Import the required packages

In [87]:
import torch
import torchvision
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, random_split

### Select the computing device

In [61]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Model hyperparameters

In [125]:
# Network dimensions.
input_size = 28 * 28   # length of a flattened 28x28 image (see the reshape in the training loop)
hidden_size = 100      # width of the single hidden layer
num_classes = 10       # number of output scores produced by the network

# Training schedule.
num_epochs = 10        # upper bound on epochs; early stopping may end training sooner
batch_size = 100       # samples per DataLoader batch
patience = 3           # epochs without validation-loss improvement tolerated before stopping

# Rprop optimizer settings.
learning_rate = 1e-3              # passed to the optimizer as `lr`
eta_minus, eta_plus = 0.5, 1.2    # passed to the optimizer as `etas`

### Load the dataset and create the data loaders 

In [None]:
# Convert each image to a tensor and normalize it with a fixed mean / std.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# download=True fetches the files into ./data on first use and does nothing
# once they are present; without it the constructor raises when the data
# directory is empty (e.g. on a fresh checkout).
full_train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transform,
    download=True
)

test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transform,
    download=True
)

# Hold out 20% of the official training split for validation.
# NOTE: the split is random and unseeded, so it differs between runs.
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

# Only the training loader shuffles; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# The Normalize constants above are meant to be the mean and std of the
# training images scaled to [0, 1]; they can be recomputed with:
#   full_train_dataset.data.float().mean() / 255
#   full_train_dataset.data.float().std() / 255

### Check the data

In [None]:
# random_split returns Subset objects, so the raw tensors live on `.dataset`
# and have to be narrowed with the subset's `.indices`.
print(f'train_dataset data shape: {train_dataset.dataset.data[train_dataset.indices].shape}')
print(f'train_dataset targets shape: {train_dataset.dataset.targets[train_dataset.indices].shape}')

print(f'val_dataset data shape: {val_dataset.dataset.data[val_dataset.indices].shape}')
print(f'val_dataset targets shape: {val_dataset.dataset.targets[val_dataset.indices].shape}')

print(f'test_dataset data shape: {test_dataset.data.shape}')
print(f'test_dataset targets shape: {test_dataset.targets.shape}')

print(f'Classes: {train_dataset.dataset.classes}')

# Pull one batch from the training loader to inspect what the model will see.
examples = iter(train_loader)
samples, labels = next(examples)
print(f'Samples shape: {samples.shape}')
print(f'Labels shape: {labels.shape}')

# Show the first six images of the batch in a 2x3 grid with their labels.
for i in range(6):
    plt.subplot(2, 3, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(samples[i][0], cmap='gray')
    plt.xlabel(f'Label: {labels[i].item()}')

# Render the grid now so it is not carried over into the next figure when
# this code runs outside an inline notebook backend.
plt.show()

### Check the data distribution

In [None]:
# Collect the labels of every split as NumPy arrays.
y_train = train_dataset.dataset.targets[train_dataset.indices].numpy()
y_val = val_dataset.dataset.targets[val_dataset.indices].numpy()
y_test = test_dataset.targets.numpy()

datasets = {
    'Train': y_train,
    'Val': y_val,
    'Test': y_test
}

# Build one long-format row per (split, digit) pair holding its frequency.
data = []
for set_name, y_data in datasets.items():
    unique, counts = np.unique(y_data, return_counts=True)
    for digit, count in zip(unique, counts):
        data.append({
            'Dataset': set_name,
            'Digit': digit,
            'Count': count
        })

df = pd.DataFrame(data)
df_pivot = df.pivot(index='Digit', columns='Dataset', values='Count')

# DataFrame.plot opens its own figure when no `ax` is given, so a preceding
# plt.figure(figsize=...) would only leave an empty figure behind. Passing
# figsize to plot() applies the size to the figure that is actually drawn.
ax = df_pivot.plot(kind='bar', width=0.8, figsize=(15, 8))

ax.set_title('Digit distribution in MNIST datasets')
ax.set_xlabel('Digit')
ax.set_ylabel('Count')
ax.legend(title='Dataset', loc='upper right')
ax.grid(True, alpha=0.3)

plt.show()

### Model class and helper functions

In [129]:
class Net(nn.Module):
    """Fully connected classifier with one hidden layer: Linear -> ReLU -> Linear.

    The forward pass returns the raw output of the last linear layer; no
    softmax is applied inside the network.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the layers.

        Args:
            input_size: Number of features in each input vector.
            hidden_size: Number of units in the hidden layer.
            num_classes: Number of output scores.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the per-class scores for the batch `x`."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
    
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, patience):
    """Train `model` with early stopping and restore its best weights.

    After every epoch the model is evaluated on `val_loader`. Whenever the
    validation loss improves, a snapshot of the weights is stored. Training
    stops early once the loss has failed to improve for `patience`
    consecutive epochs. Before returning, the snapshot with the lowest
    validation loss is loaded back into `model`.

    Args:
        model: Network to train; modified in place.
        train_loader: Iterable of (images, labels) training batches. It must
            support len(), which is used as the number of steps per epoch.
        val_loader: Iterable of (images, labels) validation batches.
        criterion: Loss function called as criterion(outputs, labels).
        optimizer: Optimizer that updates the parameters of `model`.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs that triggers
            early stopping.

    Returns:
        (train_losses, val_losses): one entry per completed epoch. The
        training loss is the mean of the per-batch losses of that epoch.
    """
    train_losses = []
    val_losses = []
    patience_counter = 0
    best_model_state = None
    best_val_loss = float('inf')
    n_total_steps = len(train_loader)

    for epoch in range(num_epochs):
        # validate_model switches to eval mode, so re-enable train mode here.
        model.train()
        running_loss = 0.0

        for step, (images, labels) in enumerate(train_loader, start=1):
            # Flatten every image into a 784-element vector.
            images = images.reshape(-1, 28*28).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if step % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{n_total_steps}], Loss: {loss.item()}')

        train_loss = running_loss / n_total_steps
        train_losses.append(train_loss)

        val_loss, _ = validate_model(model, val_loader, criterion, device)
        val_losses.append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() returns references to the live parameter tensors,
            # and a shallow dict copy keeps those references. Clone every
            # tensor so later optimizer steps cannot overwrite the snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f'Early stopping triggered at epoch {epoch+1}')
            break

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return train_losses, val_losses
    
def validate_model(model, test_loader, criterion, device):
    """Evaluate `model` on every batch of `test_loader` without gradients.

    The model is switched to eval mode and left in it.

    Args:
        model: Network to evaluate.
        test_loader: Iterable of (images, labels) batches; each image batch
            is flattened to rows of 784 values before the forward pass.
        criterion: Loss function called as criterion(outputs, labels).
        device: Device the batches are moved to.

    Returns:
        (avg_loss, accuracy): the loss averaged over samples (each batch loss
        is weighted by its batch size) and the percentage of samples whose
        highest-scoring output matches the label.
    """
    loss_sum = 0.0
    seen = 0
    hits = 0

    model.eval()
    with torch.no_grad():
        for batch_images, batch_labels in test_loader:
            flat = batch_images.reshape(-1, 28*28).to(device)
            targets = batch_labels.to(device)
            n_batch = targets.size(0)

            scores = model(flat)
            batch_loss = criterion(scores, targets)

            # Weight by batch size so a smaller final batch is not over-counted.
            loss_sum += batch_loss.item() * n_batch
            seen += n_batch

            guesses = scores.max(dim=1).indices
            hits += (guesses == targets).sum().item()

    return loss_sum / seen, 100.0 * hits / seen

def plot_losses(train_losses, val_losses, title):
    """Plot training and validation loss per epoch on one figure and show it.

    Args:
        train_losses: Per-epoch training losses; its length sets the x-axis.
        val_losses: Per-epoch validation losses.
        title: Title placed above the plot.
    """
    # Epochs are numbered from 1 on the x-axis.
    epoch_axis = range(1, len(train_losses) + 1)

    plt.figure(figsize=(10, 6))
    curves = (
        (train_losses, 'b-', 'Training Loss'),
        (val_losses, 'r-', 'Validation Loss'),
    )
    for series, fmt, label in curves:
        plt.plot(epoch_axis, series, fmt, label=label)

    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    # Anchor the legend outside the axes, to the right of the plot area.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

### Definitions

In [127]:
# Build the network on the selected device, with its loss and optimizer.
model = Net(
    input_size=input_size,
    hidden_size=hidden_size,
    num_classes=num_classes,
).to(device)
criterion = nn.CrossEntropyLoss()
# NOTE(review): the optimizer is stepped once per mini-batch in train_model;
# confirm that using Rprop this way (rather than full-batch) is intended.
optimizer = torch.optim.Rprop(
    model.parameters(),
    lr=learning_rate,
    etas=(eta_minus, eta_plus),
)

### Simple training and validation

In [None]:
# Train with early stopping on the hold-out split, then plot both loss curves.
train_losses, val_losses = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    patience=patience,
)
plot_losses(train_losses, val_losses, title='Training and Validation Loss')

# Only the accuracy on the test set is reported; the loss is discarded.
_, acc = validate_model(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
    device=device,
)
print(f'Test accuracy: {acc:.2f}%')

### KFold cross validation

In [None]:
# Raw pixels and labels of the full training set; the labels drive the
# stratified split so every fold keeps the class proportions.
X = full_train_dataset.data.numpy().reshape(-1, 784)
y = full_train_dataset.targets.numpy()

k_folds = 5
kfold = StratifiedKFold(n_splits=k_folds, shuffle=True)

fold_train_losses = []      # per-fold list of per-epoch training losses
fold_val_losses = []        # per-fold list of per-epoch validation losses
fold_final_val_losses = []  # per-fold validation loss of the returned model
fold_accuracies = []        # per-fold validation accuracy of the returned model

for fold, (train_ids, val_ids) in enumerate(kfold.split(X, y)):
    print(f'FOLD {fold+1}')

    # Both loaders read from the same dataset; the samplers restrict each
    # one to the indices of its side of the fold.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_ids)

    train_loader = DataLoader(
        full_train_dataset,
        batch_size=batch_size,
        sampler=train_subsampler
    )

    val_loader = DataLoader(
        full_train_dataset,
        batch_size=batch_size,
        sampler=val_subsampler
    )

    # Start every fold from a freshly initialized model and optimizer so no
    # state leaks between folds.
    model = Net(input_size, hidden_size, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Rprop(model.parameters(), lr=learning_rate, etas=(eta_minus, eta_plus))

    train_losses, val_losses = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, patience)

    # Score the model exactly as train_model returned it, and report this
    # loss together with the accuracy so both numbers describe the same
    # weights (the last entry of val_losses belongs to the last epoch run,
    # which is not necessarily the epoch whose weights were kept).
    val_loss, accuracy = validate_model(model, val_loader, criterion, device)

    fold_train_losses.append(train_losses)
    fold_val_losses.append(val_losses)
    fold_final_val_losses.append(val_loss)
    fold_accuracies.append(accuracy)

    print(f'Fold {fold+1} - Final Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%')

print('\nK-FOLD CROSS VALIDATION RESULTS')
print('--------------------------------')
print(f'Average accuracy: {np.mean(fold_accuracies):.2f}% (+- {np.std(fold_accuracies):.2f}%)')
print(f'Average training loss: {np.mean([losses[-1] for losses in fold_train_losses]):.4f}')
print(f'Average validation loss: {np.mean(fold_final_val_losses):.4f}')