### Import the required packages

In [59]:
import torch
import torchvision
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from sklearn.model_selection import StratifiedKFold

### Select the computing device

In [61]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Model hyperparameters

In [62]:
# Network dimensions.
input_size = 784       # features per sample: one flattened 28x28 image
hidden_size = 100      # units in the single hidden layer
num_classes = 10       # number of output scores produced by the network

# Training schedule.
num_epochs = 2
batch_size = 100

# Optimiser settings: learning rate plus the Rprop (decrease, increase)
# step-size factors passed as ``etas``.
learning_rate = 0.001
eta_minus = 0.5
eta_plus = 1.2

# Backward-compatible alias: other cells of this notebook still refer to
# the original, misspelled name.
leaning_rate = learning_rate

### Load the dataset and create the data loaders 

In [86]:
# Convert each image to a tensor, then standardise it with the dataset
# mean (0.1307) and standard deviation (0.3081).
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# Training and test splits, downloaded into ./data when not already present.
X_train = torchvision.datasets.MNIST(root='./data', train=True, transform=transform, download=True)
X_test = torchvision.datasets.MNIST(root='./data', train=False, transform=transform, download=True)

# Training batches are reshuffled every epoch; test batches keep their order.
train_loader = torch.utils.data.DataLoader(X_train, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(X_test, batch_size=batch_size, shuffle=False)

# The normalisation constants above can be reproduced with:
#   print(X_train.data.float().mean() / 255)
#   print(X_train.data.float().std() / 255)

### Check the data

In [None]:
# Raw tensor shapes of both splits, followed by the class names.
print(f'X_train data shape: {X_train.data.shape}')
print(f'X_train targets shape: {X_train.targets.shape}')
print(f'X_test data shape: {X_test.data.shape}')
print(f'X_test targets shape: {X_test.targets.shape}')
print(f'Classes: {X_train.classes}')

# Pull a single batch from the training loader to inspect what the model sees.
examples = iter(train_loader)
samples, labels = next(examples)
print(f'Samples shape: {samples.shape}')
print(f'Labels shape: {labels.shape}')

# Show the first six images of that batch on a 2x3 grid, ticks hidden,
# with the label underneath each image.
for slot in range(6):
    image, label = samples[slot], labels[slot]
    plt.subplot(2, 3, slot + 1)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(image[0], cmap='gray')  # channel 0 of the image tensor
    plt.xlabel(f'Label: {label.item()}')

### Check the data distribution

In [None]:
# Label arrays of both splits.
y_train = X_train.targets.numpy()
y_test = X_test.targets.numpy()

datasets = {
    'Train': y_train,
    'Test': y_test
}

# Long-format table: one row per (split, digit) holding its sample count.
data = []
for set_name, y_data in datasets.items():
    unique, counts = np.unique(y_data, return_counts=True)
    for digit, count in zip(unique, counts):
        data.append({
            'Dataset': set_name,
            'Digit': digit,
            'Count': count
        })

df = pd.DataFrame(data)

# Wide format: one row per digit, one column per split.
df_pivot = df.pivot(index='Digit', columns='Dataset', values='Count')

# DataFrame.plot opens a figure of its own unless it is handed an Axes.
# Create the sized figure once and draw onto it; calling plt.figure()
# beforehand would only leave a blank figure behind and the requested
# size would never reach the bar chart.
fig, ax = plt.subplots(figsize=(15, 8))
df_pivot.plot(kind='bar', width=0.8, ax=ax)

ax.set_title('Digit distribution in MNIST datasets')
ax.set_xlabel('Digit')
ax.set_ylabel('Count')
ax.legend(title='Dataset', loc='upper right')
ax.grid(True, alpha=0.3)

plt.show()

### Model class and helper functions

In [81]:
class Net(nn.Module):
    """Fully connected classifier with a single hidden ReLU layer.

    Args:
        input_size: Number of features in each (flattened) input sample.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the output of ``fc2(relu(fc1(x)))``; no softmax is applied."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
    
def train_model(model, train_loader, criterion, optimizer, device, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every batch is flattened to ``(batch, features)``, moved to ``device``
    and used for one optimiser step. The current batch loss is printed
    every 100 steps.

    Args:
        model: Module to optimise. It is not moved by this function, so it
            must already be on ``device``.
        train_loader: Sized iterable yielding ``(images, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a
            scalar loss tensor.
        optimizer: Optimiser that updates ``model``'s parameters.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one float per epoch: the mean of that epoch's
        per-batch losses.

    Raises:
        ZeroDivisionError: If ``train_loader`` has length zero and
            ``num_epochs`` is positive.
    """
    train_losses = []
    n_total_steps = len(train_loader)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for step, (images, labels) in enumerate(train_loader, start=1):
            # Flatten everything except the batch dimension, so the helper
            # works for any per-sample shape rather than only 28x28 images.
            images = images.reshape(images.size(0), -1).to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Convert to a Python float once and reuse it for the log line.
            batch_loss = loss.item()
            running_loss += batch_loss

            if step % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{step}/{n_total_steps}], Loss: {batch_loss}')

        # Mean of the batch means (batches are weighted equally).
        train_losses.append(running_loss / n_total_steps)

    return train_losses
    
def validate_model(model, test_loader, criterion, device, verbose=False):
    """Evaluate ``model`` on ``test_loader`` without computing gradients.

    Every batch is flattened to ``(batch, features)`` and moved to
    ``device``. The model is switched to evaluation mode and left in it.

    Args:
        model: Module to evaluate. It is not moved by this function, so it
            must already be on ``device``.
        test_loader: Sized iterable yielding ``(images, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)`` returning a
            scalar loss tensor.
        device: Device the batches are moved to.
        verbose: When true, print the accuracy.

    Returns:
        ``(val_losses, acc)`` where ``val_losses`` is a one-element list
        holding the mean of the per-batch losses and ``acc`` is the
        percentage of correctly classified samples.

    Raises:
        ZeroDivisionError: If ``test_loader`` has length zero.
    """
    val_losses = []
    n_total_steps = len(test_loader)
    val_loss = 0.0

    model.eval()
    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for images, labels in test_loader:
            # Flatten everything except the batch dimension, so the helper
            # works for any per-sample shape rather than only 28x28 images.
            images = images.reshape(images.size(0), -1).to(device)
            labels = labels.to(device)
            outputs = model(images)
            val_loss += criterion(outputs, labels).item()

            # The predicted class is the index of the largest output score.
            _, predicted = torch.max(outputs, 1)
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()

        # Mean of the batch means (batches are weighted equally).
        val_loss /= n_total_steps
        val_losses.append(val_loss)

        acc = 100.0 * n_correct / n_samples

        if verbose:
            print(f'Accuracy of the network on the test images: {acc} %')

    return val_losses, acc

### Definitions

In [82]:
# Cross-entropy loss applied to the network's output scores.
criterion = nn.CrossEntropyLoss()

# Fresh network, moved to the selected device.
model = Net(input_size, hidden_size, num_classes)
model = model.to(device)

# Rprop optimiser using the learning rate and step-size factors from the
# hyperparameter cell.
# NOTE(review): Rprop is usually described as a full-batch method; confirm
# that using it with mini-batches is intended here.
optimizer = torch.optim.Rprop(
    model.parameters(),
    lr=leaning_rate,
    etas=(eta_minus, eta_plus),
)

### Simple training and validation

In [None]:
# Train the model on the training loader for num_epochs epochs, then print
# its accuracy on the test loader (verbose=True).
train_model(model, train_loader, criterion, optimizer, device, num_epochs)
validate_model(model, test_loader, criterion, device, verbose=True)

### KFold cross validation

In [None]:
# Flattened pixel matrix and labels. StratifiedKFold uses them only to
# produce index splits; the actual batches come from X_train below.
X = X_train.data.numpy().reshape(len(X_train), -1)
y = X_train.targets.numpy()

k_folds = 5
# A fixed seed makes the fold assignment repeatable between runs. Weight
# initialisation and batch order are still random.
kfold = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

fold_train_losses = []
fold_val_losses = []
fold_accuracies = []

for fold, (train_ids, val_ids) in enumerate(kfold.split(X, y)):
    print(f'FOLD {fold+1}')

    # Both loaders read from the training set, each restricted to this
    # fold's indices. Fold-local names are used so that the notebook-level
    # train_loader / model / criterion / optimizer are not overwritten.
    fold_train_loader = torch.utils.data.DataLoader(
        X_train,
        batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(train_ids)
    )
    fold_val_loader = torch.utils.data.DataLoader(
        X_train,
        batch_size=batch_size,
        sampler=torch.utils.data.SubsetRandomSampler(val_ids)
    )

    # Every fold starts from a freshly initialised network and optimiser.
    fold_model = Net(input_size, hidden_size, num_classes).to(device)
    fold_criterion = nn.CrossEntropyLoss()
    fold_optimizer = torch.optim.Rprop(
        fold_model.parameters(),
        lr=leaning_rate,
        etas=(eta_minus, eta_plus)
    )

    train_losses = train_model(
        fold_model, fold_train_loader, fold_criterion, fold_optimizer, device, num_epochs
    )
    val_losses, accuracy = validate_model(
        fold_model, fold_val_loader, fold_criterion, device, verbose=False
    )

    fold_train_losses.append(train_losses)
    fold_val_losses.append(val_losses)
    fold_accuracies.append(accuracy)

    print(f'Fold {fold+1} - Final Train Loss: {train_losses[-1]:.4f}, Val Loss: {val_losses[-1]:.4f}, Accuracy: {accuracy:.2f}%')

# Summary over folds, using each fold's last recorded loss.
print('\nK-FOLD CROSS VALIDATION RESULTS')
print('--------------------------------')
print(f'Average accuracy: {np.mean(fold_accuracies):.2f}% (+- {np.std(fold_accuracies):.2f}%)')
print(f'Average training loss: {np.mean([losses[-1] for losses in fold_train_losses]):.4f}')
print(f'Average validation loss: {np.mean([losses[-1] for losses in fold_val_losses]):.4f}')