<a href="https://colab.research.google.com/github/RoshanPAN/colab_notebooks/blob/main/01_Single_node_multi_GPU_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Single-node Multi-GPU training
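- Dataset: MNIST
- Net: a simple CNN
- Task: image classification (recognizing handwritten digits)
- Status: single-device baseline so far; gradient averaging across GPUs is still a TODO in the training loop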

In [1]:
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.nn.functional as F

from torch.autograd import Variable

from torchvision import datasets, transforms

In [2]:
class Net(nn.Module):
    """Small two-convolution CNN that classifies 1x28x28 images into 10 classes.

    Shape walk-through for a batch of N single-channel 28x28 images:

        input               (N, 1, 28, 28)
        conv1 (5x5)         (N, 10, 24, 24)
        max-pool 2 + ReLU   (N, 10, 12, 12)
        conv2 (5x5)         (N, 20, 8, 8)
        dropout2d, pool 2   (N, 20, 4, 4)
        flatten             (N, 320)
        fc1 + ReLU          (N, 50)
        fc2 + log-softmax   (N, 10)
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # Classifier head: 20 channels * 4 * 4 = 320 flattened features.
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape (N, 10)."""
        # Stage 1: convolve, halve the spatial size, apply the non-linearity.
        stage1 = F.relu(F.max_pool2d(self.conv1(x), 2))
        # Stage 2: same pattern, with channel-wise dropout right after the conv.
        stage2 = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(stage1)), 2))
        # Collapse channels and spatial dims into one feature vector per image.
        features = stage2.view(-1, 320)
        hidden = F.relu(self.fc1(features))
        # Functional dropout must be told explicitly whether we are training.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)


In [7]:
%%time

from typing import Tuple

import math



# Number of full passes over the training set performed by run().
EPOCH = 20
# Mini-batch size that process_dataset() hands to the DataLoader.
BSZ = 128

def process_dataset() -> Tuple[torch.utils.data.DataLoader, int]:
    """Build a shuffled loader over the MNIST training split.

    Returns:
        A ``(loader, batch_size)`` pair, where ``batch_size`` is the
        module-level ``BSZ`` the loader was created with.
    """
    batch_size = BSZ
    # Convert images to tensors, then normalise with the given mean / std.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ])
    # train=True builds the dataset from train-images-idx3-ubyte
    # (60000 x 28 x 28); download=True fetches it into ./data.
    mnist_train = datasets.MNIST(
        "./data",
        train=True,
        download=True,
        transform=preprocess,
    )
    loader = torch.utils.data.DataLoader(
        mnist_train,
        batch_size=batch_size,
        shuffle=True,
    )
    return loader, batch_size


def run(device=None):
    """Train ``Net`` on MNIST and print the mean training loss of each epoch.

    Args:
        device: Optional ``torch.device`` (or device string) to train on.
            When omitted, CUDA is used if it is available and the CPU
            otherwise, so the same cell works on CPU and GPU runtimes.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) dataset
    train_loader, _ = process_dataset()
    print(f"Dataset size: {train_loader.dataset.data.size()}")
    # 2) nn
    model = Net().to(device)
    # Net's dropout depends on self.training, so make training mode explicit.
    model.train()
    # 3) optimizer
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    # 4) training loop
    # len() of a DataLoader is its number of batches (last partial batch included).
    num_batches = len(train_loader)
    print(f"num_batches: {num_batches}")
    for epoch in range(EPOCH):
        epoch_loss = 0.0
        for data, target in train_loader:
            # Plain tensors carry autograd state; no Variable wrapper is needed.
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            # TODO: add gradients averaging
            optimizer.step()
            # Accumulate a detached copy: adding `loss` itself would keep every
            # batch's autograd graph reachable until the end of the epoch.
            epoch_loss += loss.detach()
        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        print(f"epoch {epoch} : {epoch_loss / max(num_batches, 1)}")

# Start training; the %%time magic at the top of this cell reports how long it takes.
run()    


Dataset size: torch.Size([60000, 28, 28])
num_batches: 469


  x = F.log_softmax(x)


epoch 0 : 1.416488766670227
epoch 1 : 0.539771556854248
epoch 2 : 0.41418710350990295
epoch 3 : 0.3458663821220398
epoch 4 : 0.3084315061569214
epoch 5 : 0.27663862705230713
epoch 6 : 0.26085877418518066
epoch 7 : 0.23939724266529083
epoch 8 : 0.22665226459503174
epoch 9 : 0.2206045389175415
epoch 10 : 0.2109023928642273
epoch 11 : 0.19803933799266815
epoch 12 : 0.19459813833236694
epoch 13 : 0.19019652903079987
epoch 14 : 0.18700070679187775
epoch 15 : 0.17976084351539612
epoch 16 : 0.17496217787265778
epoch 17 : 0.16623777151107788
epoch 18 : 0.16562294960021973
epoch 19 : 0.16535300016403198
CPU times: user 3min 47s, sys: 961 ms, total: 3min 48s
Wall time: 3min 49s


# Execution Result

## CPU, BSZ=128
```
Dataset size: torch.Size([60000, 28, 28])
num_batches: 469
<ipython-input-45-f95783ecad4e>:31: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.
  x = F.log_softmax(x)
epoch 0 : 1.3038874864578247
epoch 1 : 0.5251671075820923
epoch 2 : 0.3905843496322632
epoch 3 : 0.3264372646808624
epoch 4 : 0.28955695033073425
epoch 5 : 0.2631623148918152
epoch 6 : 0.24315407872200012
epoch 7 : 0.23252680897712708
epoch 8 : 0.21824924647808075
epoch 9 : 0.20591209828853607
CPU times: user 5min 32s, sys: 841 ms, total: 5min 32s
Wall time: 5min 37s
```


## GPU, BSZ=128
```
# 10 Epochs
Dataset size: torch.Size([60000, 28, 28])
num_batches: 469
<ipython-input-2-f95783ecad4e>:31: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.
  x = F.log_softmax(x)
epoch 0 : 1.235790491104126
epoch 1 : 0.4971596300601959
epoch 2 : 0.38141945004463196
epoch 3 : 0.32811951637268066
epoch 4 : 0.2910745143890381
epoch 5 : 0.26903238892555237
epoch 6 : 0.25371941924095154
epoch 7 : 0.2374497801065445
epoch 8 : 0.22684521973133087
epoch 9 : 0.2162005603313446
CPU times: user 1min 57s, sys: 2.61 s, total: 2min
Wall time: 2min 16s

# 20 Epochs
Dataset size: torch.Size([60000, 28, 28])
num_batches: 469
<ipython-input-2-f95783ecad4e>:31: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.
  x = F.log_softmax(x)
epoch 0 : 1.416488766670227
epoch 1 : 0.539771556854248
epoch 2 : 0.41418710350990295
epoch 3 : 0.3458663821220398
epoch 4 : 0.3084315061569214
epoch 5 : 0.27663862705230713
epoch 6 : 0.26085877418518066
epoch 7 : 0.23939724266529083
epoch 8 : 0.22665226459503174
epoch 9 : 0.2206045389175415
epoch 10 : 0.2109023928642273
epoch 11 : 0.19803933799266815
epoch 12 : 0.19459813833236694
epoch 13 : 0.19019652903079987
epoch 14 : 0.18700070679187775
epoch 15 : 0.17976084351539612
epoch 16 : 0.17496217787265778
epoch 17 : 0.16623777151107788
epoch 18 : 0.16562294960021973
epoch 19 : 0.16535300016403198
CPU times: user 3min 47s, sys: 961 ms, total: 3min 48s
Wall time: 3min 49s
```

## GPU, BSZ = 1024
```
Dataset size: torch.Size([60000, 28, 28])
num_batches: 59
<ipython-input-2-f95783ecad4e>:31: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.
  x = F.log_softmax(x)
epoch 0 : 2.2444987297058105
epoch 1 : 1.8585891723632812
epoch 2 : 1.267760992050171
epoch 3 : 0.9567292928695679
epoch 4 : 0.7941415309906006
epoch 5 : 0.7030096650123596
epoch 6 : 0.6458746790885925
epoch 7 : 0.5996829271316528
epoch 8 : 0.5671847462654114
epoch 9 : 0.5353052020072937
CPU times: user 1min 45s, sys: 348 ms, total: 1min 45s
Wall time: 1min 46s
```

## GPU, BSZ = 4096
```
Dataset size: torch.Size([60000, 28, 28])
num_batches: 15
<ipython-input-2-f95783ecad4e>:31: UserWarning: Implicit dimension choice for log_softmax has been deprecated. Change the call to include dim=X as an argument.
  x = F.log_softmax(x)
epoch 0 : 2.3003547191619873
epoch 1 : 2.278226375579834
epoch 2 : 2.256819486618042
epoch 3 : 2.2294697761535645
epoch 4 : 2.188817024230957
epoch 5 : 2.125926971435547
epoch 6 : 2.0263874530792236
epoch 7 : 1.8801077604293823
epoch 8 : 1.7081881761550903
epoch 9 : 1.5435491800308228
CPU times: user 1min 46s, sys: 305 ms, total: 1min 47s
Wall time: 1min 47s
```

# Some References

## MNIST dataset
Download the MNIST dataset from the following link: http://yann.lecun.com/exdb/mnist/
```
 MNIST dataset contents
    ├──  train-images-idx3-ubyte  # MNIST training images   
    ├──  train-labels-idx1-ubyte  # MNIST training labels
    ├──  t10k-images-idx3-ubyte   # MNIST test images
    └──  t10k-labels-idx1-ubyte   # MNIST test labels

```

## torch.optim / Optimizer
- [PyTorch Docs - torch.optim](https://pytorch.org/docs/stable/optim.html)


## 1D/2D/3D Conv Explained
https://stackoverflow.com/questions/42883547/intuitive-understanding-of-1d-2d-and-3d-convolutions-in-convolutional-neural-n/44628011#44628011

<!--- 
![CNN.png](https://i.stack.imgur.com/I25ty.png) 
![CNN.png](https://i.stack.imgur.com/xIdEq.png) 
![CNN.png](https://i.stack.imgur.com/HCWgp.png) 
--->