# A three-layer multi-label-classification using MNIST dataset (CPU only)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

## Check if pytorch can use GPU

In [2]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
0
<torch.cuda.device object at 0x000001EEB61200C8>
1
NVIDIA GeForce RTX 3080


In [3]:
batch_size=200
epochs=10

## Set up training and testing dataset using MNIST. 
- First time it will download MNIST data online to `../data` path;
- Parameter `transforms.Normalize((0.1307, ), (0.3081, ))` is some experiential normalization parameters on original images which can improve the classification result. It can be skipped if you don't need.
- Distinguish training or testing dataset by setting `train=True` or `train=False`.

In [4]:
train_loader = torch.utils.data.DataLoader(datasets.MNIST(
    '../data',
    train=True,
    download=True,
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])),
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets.MNIST(
    '../data',
    train=False,
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])),
                                          batch_size=batch_size,
                                          shuffle=True)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [5]:
# Print a sample to verify the data loader.
# Each image in MNIST dataset is a 28x28 grayscale (1-channel) image.
# Then, data.shape should be [batch_size, 1, 28, 28], while target.shape should be [batch_size].
source, target = next(iter(train_loader))
print(source.shape, target.shape)
print(source.data, target.data)

torch.Size([200, 1, 28, 28]) torch.Size([200])
tensor([[[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242

## Set up MLP layers
Here we use three layers.

In [6]:
# NOTE: 
# - 784 is the input dimension, while 200 is output dimension (for the first 
# layer), according to the steps in forward() below. 
# - 10 in layer 3 is the output dimension (since there are 10 labels in MNIST dataset)
# - Remember to set requires_grad=True, so this is a variable to be optimized.
w1, b1 = torch.randn(200, 784, requires_grad=True),\
         torch.zeros(200, requires_grad=True)
w2, b2 = torch.randn(200, 200, requires_grad=True),\
         torch.zeros(200, requires_grad=True)
w3, b3 = torch.randn(10, 200, requires_grad=True),\
         torch.zeros(10, requires_grad=True)

# This is to add a special normalization method proposed by Kaiming He in his paper. 
# This is good for image classification problem that it can increase effiency. 
# If not using it, losses are easily not decreasing in the middle.
# Ref: https://pytorch.org/cppdocs/api/function_namespacetorch_1_1nn_1_1init_1ac8a913c051976a3f41f20df7d6126e57.html
torch.nn.init.kaiming_normal_(w1)
torch.nn.init.kaiming_normal_(w2)
torch.nn.init.kaiming_normal_(w3)

# forward function
def forward(x):
    # NOTE here we multiply input original matrix x by w1's transpose.
    x = x@w1.t() + b1
    x = F.relu(x)
    x = x@w2.t() + b2
    x = F.relu(x)
    x = x@w3.t() + b3
    x = F.relu(x) # this last relu can be skipped but harmless, up to you
    return x

## Set up main training and testing pipeline

In [7]:
# Optimizer (SGD) and loss type.
learning_rate = 0.01
optimizer = optim.SGD([w1, b1, w2, b2, w3, b3], lr=learning_rate)
criteon = nn.CrossEntropyLoss()

# Main loop
for epoch in range(epochs):

    # Training
    for batch_idx, (data, target) in enumerate(train_loader):
        # [b, 1, 28, 28] => [b, 28*28]
        data = data.view(-1, 28 * 28)

        # Call forward() function.
        logits = forward(data)

        # NOTE: pytorch's CrossEntropyLoss() already contains softwax. So don't call it again.
        loss = criteon(logits, target)

        # Clear gradients to zero before computing them.
        optimizer.zero_grad()
        # Set up backward computation on loss
        loss.backward()
        # This will update newly computed gradients
        optimizer.step()

        # Print some loss values
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    # Testing
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        logits = forward(data)

        # print(logits.data.shape)
        # print(type(logits.data))
        
        # .item() is to get value(s) from the loss
        test_loss += criteon(logits, target).item()

        # pred is to get all predicted labels.
        # - .data: get data from tensor. 'logits.data' should be [batch_size, 10], where 10 is number of labels.
        # - .max(1): get maximum values and indices along axis=1.
        # - .max(1)[1]: max(1) returns two items: values of maximums, and indices of these maximum values (along 
        # that axis), which is exactly the predicted labels.
        pred = logits.data.max(1)[1]
        
        # Sum the number of correct prediction, by comparing predicted labels (pred) with target labels (target.data).
        # - .eq(): returns a tensor with same size of pred or target.data that, its value is 1 if the two corresponding
        #          values are equal, or 0 if not.
        # - .sum(): summarize all values to get the number of correct predictions.
        correct += pred.eq(target.data).sum()
    
    # Print testing loss for this epoch
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    


Test set: Average loss: 0.0029, Accuracy: 8124/10000 (81%)


Test set: Average loss: 0.0025, Accuracy: 8326/10000 (83%)


Test set: Average loss: 0.0023, Accuracy: 8425/10000 (84%)


Test set: Average loss: 0.0022, Accuracy: 8490/10000 (85%)


Test set: Average loss: 0.0021, Accuracy: 8521/10000 (85%)


Test set: Average loss: 0.0021, Accuracy: 8557/10000 (86%)


Test set: Average loss: 0.0020, Accuracy: 8577/10000 (86%)


Test set: Average loss: 0.0020, Accuracy: 8595/10000 (86%)


Test set: Average loss: 0.0019, Accuracy: 8614/10000 (86%)


Test set: Average loss: 0.0019, Accuracy: 8626/10000 (86%)

