In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
import numpy as np
# Fix the global RNG seed so repeated runs start from the same random state.
torch.manual_seed(7)

# Hyperparameters: number of passes over the training set and mini-batch size.
EPOCHS, BATCH_SIZE = 38, 25

# Run on the first CUDA device when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# First pass: read the whole training set as one batch of tensors so that the
# mean and standard deviation can be computed over every pixel at once.
transform = transforms.Compose([transforms.ToTensor()])
trainset = MNIST('./pt_data', train=True, transform=transform, download=True)
trainloader = DataLoader(trainset, shuffle=True, batch_size=len(trainset))
data = next(iter(trainloader))
images = data[0]  # data is an (images, labels) pair; only the images are needed.
mean, stddev = images.mean(), images.std()

# Second pass: rebuild the transform so every image is converted to a tensor
# and then standardized with the statistics measured above.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, stddev),
])

# Both splits share the same root directory and the same standardizing
# transform; the test split reuses the training-set statistics.
mnist_options = dict(root='./pt_data', download=True, transform=transform)
trainset = MNIST(train=True, **mnist_options)
testset = MNIST(train=False, **mnist_options)

def _init_linear(module):
    """Re-initialize a Linear layer: weights uniform in [-0.1, 0.1), biases zero.

    Modules of any other type are left untouched.
    """
    if not isinstance(module, nn.Linear):
        return
    nn.init.uniform_(module.weight, a=-0.1, b=0.1)
    nn.init.constant_(module.bias, 0.0)


# Sequential (feed-forward) classifier:
#   * each input is flattened to 784 values,
#   * hidden fully-connected layer: 25 neurons, tanh activation,
#   * output fully-connected layer: 10 neurons, logistic (sigmoid) activation.
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(in_features=784, out_features=25),
    nn.Tanh(),
    nn.Linear(in_features=25, out_features=10),
    nn.Sigmoid(),
)

# Initialize weights. apply() visits the layers in definition order, so the
# two Linear layers are re-initialized first-to-last.
model.apply(_init_linear)

# Mean squared error (MSE) is the loss; the weights are updated with plain
# stochastic gradient descent (SGD) at a learning rate of 0.01 and no
# further options.
loss_function = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

# Move the model to the selected device (GPU when available, otherwise CPU).
model.to(device)

# DataLoader objects split each dataset into mini-batches of BATCH_SIZE
# samples. Only the training data is reshuffled.
trainloader = DataLoader(dataset=trainset, shuffle=True, batch_size=BATCH_SIZE)
testloader = DataLoader(dataset=testset, shuffle=False, batch_size=BATCH_SIZE)

# Train the model. In PyTorch we have to implement the training loop ourselves.
# Every epoch makes one pass over the training set (with weight updates) and
# one pass over the test set (evaluation only), then prints the averaged
# loss and accuracy of both passes.
for i in range(EPOCHS):
    model.train()  # Set model in training mode.
    train_loss = 0.0
    train_correct = 0
    train_batches = 0
    train_samples = 0
    for inputs, targets in trainloader:
        # Move data to the selected device, then build the one-hot targets
        # there; MSELoss needs a target with the same shape as the outputs.
        inputs, targets = inputs.to(device), targets.to(device)
        one_hot_targets = nn.functional.one_hot(targets, num_classes=10).float()

        # Zero the parameter gradients.
        optimizer.zero_grad()

        # Forward pass.
        outputs = model(inputs)
        loss = loss_function(outputs, one_hot_targets)

        # Accumulate metrics. detach() keeps this bookkeeping out of autograd.
        _, indices = torch.max(outputs.detach(), 1)
        train_correct += (indices == targets).sum().item()
        train_batches += 1
        train_samples += targets.size(0)
        train_loss += loss.item()

        # Backward pass and update.
        loss.backward()
        optimizer.step()

    # Loss is averaged per batch; accuracy is averaged over the samples
    # actually seen, so a smaller final batch does not skew the result.
    train_loss = train_loss / train_batches
    train_acc = train_correct / train_samples

    # Evaluate the model on the test dataset. Same bookkeeping as above but
    # without weight adjustment, so gradient tracking is switched off.
    model.eval()  # Set model in inference mode.
    test_loss = 0.0
    test_correct = 0
    test_batches = 0
    test_samples = 0
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            one_hot_targets = nn.functional.one_hot(targets, num_classes=10).float()
            outputs = model(inputs)
            loss = loss_function(outputs, one_hot_targets)
            _, indices = torch.max(outputs, 1)
            test_correct += (indices == targets).sum().item()
            test_batches += 1
            test_samples += targets.size(0)
            test_loss += loss.item()

    test_loss = test_loss / test_batches
    test_acc = test_correct / test_samples

    print(f'Epoch {i+1}/{EPOCHS} loss: {train_loss:.4f} - acc: {train_acc:0.4f} - val_loss: {test_loss:.4f} - val_acc: {test_acc:0.4f}')

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./pt_data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 17612856.44it/s]


Extracting ./pt_data/MNIST/raw/train-images-idx3-ubyte.gz to ./pt_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./pt_data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 478602.36it/s]


Extracting ./pt_data/MNIST/raw/train-labels-idx1-ubyte.gz to ./pt_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./pt_data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 3773037.74it/s]


Extracting ./pt_data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./pt_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./pt_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 3067718.00it/s]


Extracting ./pt_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./pt_data/MNIST/raw

Epoch 1/38 loss: 0.1185 - acc: 0.2432 - val_loss: 0.0892 - val_acc: 0.3066
Epoch 2/38 loss: 0.0873 - acc: 0.3195 - val_loss: 0.0855 - val_acc: 0.3364
Epoch 3/38 loss: 0.0835 - acc: 0.3557 - val_loss: 0.0811 - val_acc: 0.3793
Epoch 4/38 loss: 0.0786 - acc: 0.4022 - val_loss: 0.0757 - val_acc: 0.4310
Epoch 5/38 loss: 0.0733 - acc: 0.4550 - val_loss: 0.0704 - val_acc: 0.4842
Epoch 6/38 loss: 0.0683 - acc: 0.5115 - val_loss: 0.0657 - val_acc: 0.5354
Epoch 7/38 loss: 0.0639 - acc: 0.5668 - val_loss: 0.0614 - val_acc: 0.5948
Epoch 8/38 loss: 0.0598 - acc: 0.6293 - val_loss: 0.0574 - val_acc: 0.6610
Epoch 9/38 loss: 0.0560 - acc: 0.6857 - val_loss: 0.0536 - val_acc: 0.7147
Epoch 10/38 loss: 0.0524 - acc: 0.7307 - val_loss: 0.0500 - val_acc: 0.7556
Epoch 11/38 loss: 0.0490 - acc: 0.7646 - val_loss: 0.0468 - val_acc: 0.7844
Epoch 12/38 loss: 0.0460 - acc: 0.7908 - val_loss: 0.0438 - val_acc: 0.8110
Epoch 13/38 loss

Observation: The batch size is a hyperparameter that defines the number of samples to work through before updating the internal model parameters.