# Utilizing multiple devices for training a PyTorch model

In [1]:
import torch
# Record the installed PyTorch build so the timings below can be tied to a version.
print(torch.__version__)

1.10.0+cu111


## Preparing model training

Prepare the necessary components for model training. We will train a simple model with two convolutional layers.

In [2]:
from torchvision import datasets
from torchvision.transforms import ToTensor
# Training split: stored under ./data, downloaded if requested files are missing,
# with every image converted by ToTensor().
train_data = datasets.MNIST(root='data', train=True, transform=ToTensor(), download=True)

# Evaluation split: same root and transform; no download is requested here.
test_data = datasets.MNIST(root='data', train=False, transform=ToTensor())

In [3]:
from torch.utils.data import DataLoader
# Both loaders yield shuffled mini-batches of 100 samples using two worker processes.
train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=100,
    shuffle=True,
    num_workers=2,
)
test_data_loader = DataLoader(
    dataset=test_data,
    batch_size=100,
    shuffle=True,
    num_workers=2,
)

In [4]:
import torch.nn as nn

class CNN(nn.Module):
    """Two conv -> ReLU -> max-pool stages followed by a 10-way linear layer.

    The linear layer expects 32 * 7 * 7 flattened features, which is what the
    convolutional stack produces for single-channel 28x28 inputs (the 5x5
    convolutions are padded to keep the size; each pool halves it).
    """

    def __init__(self):
        super().__init__()
        feature_stages = [
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        ]
        self.conv_layers = nn.Sequential(*feature_stages)
        # Fully connected layer mapping the flattened feature maps to 10 classes.
        self.linear_layer = nn.Linear(in_features=32 * 7 * 7, out_features=10)

    def forward(self, x):
        """Return the 10 class scores for each sample in batch ``x``."""
        feature_maps = self.conv_layers(x)
        # Keep the batch dimension and collapse everything else into one axis.
        flattened = feature_maps.view(feature_maps.shape[0], -1)
        return self.linear_layer(flattened)

In [5]:
from torch.autograd import Variable
from timeit import default_timer as timer
from torch import optim

def train_model(model, device, num_epochs=10, lr=0.01, data_loader=None):
    """Train ``model`` with Adam and cross-entropy loss, timing the whole run.

    Args:
        model: Module to optimise. Only the batches are moved to ``device``
            here, so the model must already be on that device.
        device: ``torch.device`` (or device string) the batches are moved to.
        num_epochs: Number of passes over the data (default 10).
        lr: Learning rate handed to Adam (default 0.01).
        data_loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``train_data_loader``.

    Returns:
        Elapsed training time in seconds (also printed).
    """
    if data_loader is None:
        data_loader = train_data_loader
    device = torch.device(device)
    print(f'Training model on {device}')

    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()

    # CUDA kernels run asynchronously; synchronise around the timed region so
    # the measurement covers the actual GPU work and not just kernel launches.
    uses_cuda = device.type == 'cuda'
    if uses_cuda:
        torch.cuda.synchronize(device)
    start = timer()
    for epoch in range(num_epochs):
        loss = None  # stays None if the loader yields no batches
        for images, labels in data_loader:
            b_x = images.to(device)
            b_y = labels.to(device)

            optimizer.zero_grad()
            output = model(b_x)
            loss = loss_func(output, b_y)
            loss.backward()
            optimizer.step()

        # The reported value is the loss of the last batch in the epoch.
        if loss is not None:
            print (f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

    if uses_cuda:
        torch.cuda.synchronize(device)
    end = timer()
    elapsed = end - start
    print(f'Training took \033[1m{elapsed:.2f} seconds\033[0m')
    return elapsed

In [6]:
def check_predictions(model, device, data_loader=None, num_samples=10):
    """Print predicted and actual labels for the first samples of one batch.

    Args:
        model: Trained module; it is switched to evaluation mode.
        device: Device the selected images are moved to before the forward pass.
        data_loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``test_data_loader``.
        num_samples: How many samples from the start of the batch to show
            (default 10).
    """
    if data_loader is None:
        data_loader = test_data_loader
    model.eval()
    images, labels = next(iter(data_loader))
    # Only the samples that are displayed are transferred to the device.
    images = images[:num_samples].to(device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        test_output = model(images)
    # Index of the highest score per sample is the predicted class.
    pred_y = test_output.argmax(dim=1).cpu().numpy()
    print(f'Prediction number: {pred_y}')

    actual_number = labels[:num_samples].cpu().numpy()
    print(f'Actual number: {actual_number}')

## Training on CPU

In [7]:
# A freshly constructed model already lives on the CPU, so no transfer is needed;
# the device object is only used to place the batches.
model_on_cpu = CNN()
device = torch.device('cpu')

In [8]:
# Train on the CPU, then compare predictions with labels for ten test samples.
train_model(model_on_cpu, device)
check_predictions(model_on_cpu, device)

Training model on cpu
Epoch [1/10], Loss: 0.0280
Epoch [2/10], Loss: 0.0349
Epoch [3/10], Loss: 0.1146
Epoch [4/10], Loss: 0.0437
Epoch [5/10], Loss: 0.0110
Epoch [6/10], Loss: 0.0355
Epoch [7/10], Loss: 0.0419
Epoch [8/10], Loss: 0.0080
Epoch [9/10], Loss: 0.1930
Epoch [10/10], Loss: 0.0417
Training took [1m55.25 seconds[0m
Prediction number: [0 8 0 3 6 1 2 0 4 5]
Actual number: [0 8 0 3 6 1 2 0 4 5]


## Training on single GPU

In [9]:
# List the GPUs available on this machine with the `nvidia-smi` command
!nvidia-smi

Tue Aug 23 23:50:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:01:00.0  On |                  N/A |
| 39%   48C    P8    34W / 260W |    753MiB / 11019MiB |      8%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:02:00.0 Off |                  N/A |
|  0%   44C    P8    13W / 300W |      4MiB / 11019MiB |      0%      Defaul

In [10]:
# Target the first CUDA device.
device = torch.device('cuda:0')

# Build a fresh model and move its parameters onto that device
# (the cell displays the module returned by .to()).
model_on_gpu = CNN()
model_on_gpu.to(device)

CNN(
  (conv_layers): Sequential(
    (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (linear_layer): Linear(in_features=1568, out_features=10, bias=True)
)

In [11]:
# Train on the single GPU, then compare predictions with labels for ten test samples.
train_model(model_on_gpu, device)
check_predictions(model_on_gpu, device)

Training model on cuda:0
Epoch [1/10], Loss: 0.0169
Epoch [2/10], Loss: 0.0597
Epoch [3/10], Loss: 0.0352
Epoch [4/10], Loss: 0.0146
Epoch [5/10], Loss: 0.0518
Epoch [6/10], Loss: 0.0302
Epoch [7/10], Loss: 0.0077
Epoch [8/10], Loss: 0.0150
Epoch [9/10], Loss: 0.0666
Epoch [10/10], Loss: 0.1359
Training took [1m23.70 seconds[0m
Prediction number: [7 2 7 3 8 9 6 3 4 2]
Actual number: [7 2 7 3 8 9 6 3 4 2]


In [12]:
# The test loader shuffles, so calling this again draws a different batch.
check_predictions(model_on_gpu, device)

Prediction number: [9 3 9 7 2 4 1 2 8 5]
Actual number: [9 3 9 7 2 4 1 2 8 5]


## Training on multiple GPUs using data parallelism

In [13]:
# Number of CUDA devices PyTorch can use.
torch.cuda.device_count()

2

In [14]:
import torch.nn as nn

# DataParallel needs the wrapped parameters on its first device
# (device_ids[0], which defaults to cuda:0), so the model is placed there.
device = torch.device('cuda:0')

# Wrap a plain CNN in nn.DataParallel so each input batch is split across the GPUs.
cnn_model = CNN()
data_parallel_model = nn.DataParallel(cnn_model)
data_parallel_model.to(device)

DataParallel(
  (module): CNN(
    (conv_layers): Sequential(
      (0): Conv2d(1, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (1): ReLU()
      (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
      (4): ReLU()
      (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (linear_layer): Linear(in_features=1568, out_features=10, bias=True)
  )
)

In [15]:
train_model(data_parallel_model, device)
# Check the model that was just trained (the DataParallel wrapper), not the
# single-GPU model from the previous section.
check_predictions(data_parallel_model, device)

Training model on cuda:0
Epoch [1/10], Loss: 0.1232
Epoch [2/10], Loss: 0.0888
Epoch [3/10], Loss: 0.0295
Epoch [4/10], Loss: 0.0113
Epoch [5/10], Loss: 0.0058
Epoch [6/10], Loss: 0.1088
Epoch [7/10], Loss: 0.0358
Epoch [8/10], Loss: 0.0013
Epoch [9/10], Loss: 0.0326
Epoch [10/10], Loss: 0.0398
Training took [1m25.94 seconds[0m
Prediction number: [3 9 2 1 5 5 7 5 5 2]
Actual number: [3 9 2 1 5 5 7 5 5 2]
