# A three-layer multi-label-classification with custom NN class (CPU)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

## Check if pytorch can use GPU

In [9]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
0
<torch.cuda.device object at 0x00000204743CDC48>
1
NVIDIA GeForce RTX 3080


## Set up training and testing dataset using MNIST

In [11]:
batch_size = 200
epochs = 10

# - First time it will download MNIST data online to `../data` path;
# - Parameter `transforms.Normalize((0.1307, ), (0.3081, ))` is some experiential normalization
#   parameters on original images which can improve the classification result. It can be skipped
#   if you don't need.
# - Distinguish training or testing dataset by setting `train=True` or `train=False`.
train_loader = torch.utils.data.DataLoader(datasets.MNIST(
    '../data',
    train=True,
    download=True,
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])),
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(datasets.MNIST(
    '../data',
    train=False,
    transform=transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])),
                                          batch_size=batch_size,
                                          shuffle=True)

In [12]:
# Print a sample to verify the data loader.
# Each image in MNIST dataset is a 28x28 grayscale (1-channel) image.
# Then, data.shape should be [batch_size, 1, 28, 28], while target.shape should be [batch_size].
source, target = next(iter(train_loader))
print(source.shape, target.shape)
print(source.data, target.data)

torch.Size([200, 1, 28, 28]) torch.Size([200])
tensor([[[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          ...,
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],


        [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242

## Set up MLP layers
Here we use three layers and design a inherited class.

In [13]:
class MLP(nn.Module):
    def __init__(self, in_dim, out_dim):
        super(MLP, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(in_dim, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, out_dim),
            nn.ReLU(inplace=True),
        )
        # Initialize weights here
        self._init_weight('kaiming')

    # Initialization function, first checks the module type,
    # then applies the desired changes to the weights.
    def _init_weight(self, init_method):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                if init_method == 'kaiming':
                    # 'fan_in': to create weight implicitly by creating a linear layer
                    nn.init.kaiming_normal_(m.weight, mode='fan_in')
                else:
                    nn.init.normal_(m.weight, 0, 0.01)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.model(x)
        return x


## Set up main training and testing pipeline

In [14]:
# NN, optimizer, loss type
# NOTE: here 28*28 is input dimension, while 10 is output dimension (10 labels)
net = MLP(28*28, 10)
learning_rate = 0.01
optimizer = optim.SGD(net.parameters(), lr=learning_rate)
criteon = nn.CrossEntropyLoss()

# Main loop
for epoch in range(epochs):

    # Training
    for batch_idx, (data, target) in enumerate(train_loader):
        # [b, 1, 28, 28] => [b, 28*28]
        data = data.view(-1, 28 * 28)

        # Call forward() function.
        logits = net(data)

        # NOTE: pytorch's CrossEntropyLoss() already contains softwax. So don't call it again.
        loss = criteon(logits, target)

        # Clear gradients to zero before computing them.
        optimizer.zero_grad()
        # Set up backward computation on loss
        loss.backward()
        # This will update newly computed gradients
        optimizer.step()

        # Print some loss values
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

    # Testing
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data = data.view(-1, 28 * 28)
        logits = net(data)

        # print(logits.data.shape)
        # print(type(logits.data))
        
        # .item() is to get value(s) from the loss
        test_loss += criteon(logits, target).item()

        # pred is to get all predicted labels. 'logits' should be [batch_size, 10], where 10 is number of labels.
        # - .max(1): get maximum values and indices along axis=1.
        # - .max(1)[1]: max(1) returns two items that, [0] is values of maximums, and [1] is indices of these maximum 
        #               values (along the axis 1), while the latter is exactly the predicted labels.
        # pred = logits.max(1)[1]
        # NOTE: this works the same as above.
        pred = logits.argmax(dim=1)
        
        # Sum the number of correct prediction, by comparing predicted labels (pred) with target labels (target.data).
        # - .eq(): returns a tensor with same size of pred or target.data that, its value is 1 if the two corresponding
        #          values are equal, or 0 if not.
        # - .sum(): summarize all values to get the number of correct predictions.
        correct += pred.eq(target).sum()
    
    # Print testing loss for this epoch
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    


Test set: Average loss: 0.0043, Accuracy: 7350/10000 (74%)


Test set: Average loss: 0.0038, Accuracy: 7734/10000 (77%)


Test set: Average loss: 0.0036, Accuracy: 7943/10000 (79%)


Test set: Average loss: 0.0035, Accuracy: 8077/10000 (81%)


Test set: Average loss: 0.0034, Accuracy: 8173/10000 (82%)


Test set: Average loss: 0.0033, Accuracy: 8190/10000 (82%)


Test set: Average loss: 0.0033, Accuracy: 8235/10000 (82%)


Test set: Average loss: 0.0022, Accuracy: 8379/10000 (84%)


Test set: Average loss: 0.0021, Accuracy: 8418/10000 (84%)


Test set: Average loss: 0.0021, Accuracy: 8440/10000 (84%)

