# **Deep learning for image analysis with Python**

#### Fernando Cervantes, Systems Analyst I, Imaging Solutions, Research IT
#### fernando.cervantes@jax.org    (slack) @fernando.cervantes

## 6 Monitoring and logging the training process

It is important to track the training process. By doing that, we can detect interesting behavior of our network, possible failures, and even *overfitting*.<br>
This also helps to save the results of different experiments performed using distinct configurations.

### 6.1 _Logging the network performance_

In [11]:
from torchvision.datasets import CIFAR100
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.utils.data import DataLoader

In [12]:
# Pre-processing pipeline: the only step is the conversion of each image to a
# tensor (no normalization is applied here).
cifar_transform_list = [ToTensor()]
cifar_transform = Compose(cifar_transform_list)

# Training split of CIFAR-100, read from an existing local copy
# (download=False, so the files must already be under `root`).
cifar_trn_dataset = CIFAR100(
    root='/home/cervaf/data',
    train=True,
    download=False,
    transform=cifar_transform,
)

# Shuffled mini-batches of 128 samples, collated into pinned host memory.
cifar_trn_loader = DataLoader(
    cifar_trn_dataset,
    batch_size=128,
    shuffle=True,
    pin_memory=True,
)

In [13]:
import sklearn.metrics
import numpy as np

In [24]:
import torch
import torch.nn as nn

class ResNetBlock(nn.Module):
    """Residual block made of two 3x3 convolutions and a shortcut branch.

    The main branch computes ``conv3x3 -> ReLU -> BatchNorm -> conv3x3 ->
    BatchNorm``. Its output is added to the shortcut branch and the sum is
    passed through a final ReLU.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        downsample: When ``True``, the first convolution and the shortcut use
            stride 2, so the spatial size is halved (rounded up).

    The shortcut is the identity only when the block changes neither the
    resolution nor the number of channels. In every other case a bias-free
    1x1 convolution projects the input so that it can be added to the main
    branch.
    """

    def __init__(self, in_channels, out_channels, downsample=False):
        super().__init__()

        stride = 2 if downsample else 1

        # NOTE: the sub-modules are registered in the same order as before
        # (conv_1, downsample, bn1, conv_2, bn2, relu) so that parameter
        # ordering and state-dict keys do not change.
        self.conv_1 = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=stride, padding=1, bias=False)

        if downsample or in_channels != out_channels:
            # Projection shortcut: matches both the stride and the channel
            # count of the main branch. Without it, a channel-changing block
            # with downsample=False would fail at `fx + identity`.
            self.downsample = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=0, bias=False)
        else:
            self.downsample = nn.Identity()

        self.bn1 = nn.BatchNorm2d(num_features=out_channels)

        self.conv_2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(num_features=out_channels)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = self.downsample(x)

        # The ReLU is applied before the first batch normalization in this
        # implementation; the order is kept as is so that trained weights
        # keep producing the same outputs.
        fx = self.relu(self.conv_1(x))
        fx = self.bn1(fx)

        fx = self.conv_2(fx)
        fx = self.bn2(fx)

        fx = self.relu(fx + identity)

        return fx

    
class ResNet(nn.Module):
    """ResNet-style image classifier built from eight ``ResNetBlock`` modules.

    Args:
        in_channels: Number of channels of the input images.
        num_classes: Number of output classes (size of the logits vector).

    The network is a 3x3 embedding convolution, four stages of two residual
    blocks each (64, 128, 256 and 512 channels, every stage starting with a
    stride-2 block), a global average pooling and a linear classifier.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()

        self.embedding = nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(num_features=64)
        self.relu = nn.ReLU()
        # For a 32x32 input the output is 32x32x64

        self.layers = nn.Sequential(
            ResNetBlock(in_channels=64, out_channels=64, downsample=True),
            ResNetBlock(in_channels=64, out_channels=64, downsample=False),
            ResNetBlock(in_channels=64, out_channels=128, downsample=True),
            ResNetBlock(in_channels=128, out_channels=128, downsample=False),
            ResNetBlock(in_channels=128, out_channels=256, downsample=True),
            ResNetBlock(in_channels=256, out_channels=256, downsample=False),
            ResNetBlock(in_channels=256, out_channels=512, downsample=True),
            ResNetBlock(in_channels=512, out_channels=512, downsample=False)
        )
        # For a 32x32 input the output is 2x2x512

        # Global average pooling: always reduces the feature map to 1x1,
        # whatever the input resolution. On the 2x2 map produced by 32x32
        # inputs this is the same average as a 2x2 pooling with stride 2, and
        # the layer has no parameters, so saved weights stay compatible.
        self.avg_pool = nn.AdaptiveAvgPool2d(output_size=1)
        # Output is 1x1x512

        self.fc = nn.Linear(in_features=512, out_features=num_classes, bias=True)
        # Output is 1xnum_classes

    def forward(self, x):
        """Return the class logits, one row of ``num_classes`` values per image."""
        fx = self.bn(self.relu(self.embedding(x)))

        fx = self.layers(fx)

        fx = self.avg_pool(fx)

        # Flatten everything but the batch dimension. Unlike `view(-1, 512)`,
        # this can never fold spatial positions into the batch dimension.
        logits = self.fc(torch.flatten(fx, start_dim=1))

        return logits

# Loss function and model for the 100 classes of CIFAR-100 (3-channel images).
criterion = nn.CrossEntropyLoss()
net = ResNet(num_classes=100, in_channels=3)

# Move both modules to the GPU (done in place). The last expression is what
# the notebook displays as the cell output.
net.cuda()
criterion.cuda()

CrossEntropyLoss()

In [25]:
import torch.optim as optim

# Adam over every parameter of the network, with a learning rate of 0.001.
optimizer = optim.Adam(net.parameters(), lr=1e-3)

***
Now that we have set up our experiment, let's create a summary writer for our training stage.

In [26]:
from torch.utils.tensorboard import SummaryWriter

Create a summary writer using TensorBoard

In [27]:
# Event files for this run are written under this directory; its name encodes
# the learning rate and the batch size of the experiment.
writer = SummaryWriter(log_dir='runs/LR_0.001_BATCH_128')

In [28]:
# Pair of CUDA events with timing enabled, used to measure elapsed GPU time
# between a `record()` on the first and a `record()` on the second.
starter, ender = (torch.cuda.Event(enable_timing=True) for _ in range(2))

In [29]:
# Loss of every training batch, in iteration order.
trn_loss = []
# NOTE: despite its name, this is the running COUNT of correctly classified
# training samples, accumulated over all the epochs.
trn_avg_acc = 0
net.train()

num_epochs = 100
batches_per_epoch = len(cifar_trn_loader)

for e in range(num_epochs):
    avg_loss = 0
    epoch_correct = 0
    epoch_samples = 0

    for i, (x, t) in enumerate(cifar_trn_loader):
        # Global iteration index, used as the x axis of the per-batch curves.
        global_step = e * batches_per_epoch + i

        starter.record()
        optimizer.zero_grad()

        x = x.cuda()
        t = t.cuda()

        y = net(x)

        loss = criterion(y, t)

        loss.backward()

        optimizer.step()
        ender.record()

        # Wait for the GPU so that both events are complete before the
        # elapsed time is queried.
        torch.cuda.synchronize()
        e_time = starter.elapsed_time(ender)

        # The scalars are read only after the synchronization, so no extra
        # host/device stall happens inside the timed region, and each value is
        # transferred once.
        batch_loss = loss.item()
        avg_loss += batch_loss
        trn_loss.append(batch_loss)

        # The accuracy is computed on the GPU; only the count of correct
        # predictions is copied to the host.
        batch_size = t.size(0)
        batch_correct = (y.detach().argmax(dim=1) == t).sum().item()
        trn_avg_acc += batch_correct
        epoch_correct += batch_correct
        epoch_samples += batch_size

        writer.add_scalar('batch time', e_time, global_step)
        writer.add_scalar('training loss', batch_loss, global_step)
        writer.add_scalar('training acc', batch_correct / batch_size, global_step)

    avg_loss = avg_loss / batches_per_epoch

    # Per-epoch summaries, indexed by epoch instead of by iteration.
    writer.add_scalar('epoch training loss', avg_loss, e)
    writer.add_scalar('epoch training acc', epoch_correct / epoch_samples, e)
    # Make the curves visible in TensorBoard while the training is running.
    writer.flush()

    print('[Epoch %02i] %.8f' % (e, avg_loss))
    

[Epoch 00] 3.65062169
[Epoch 01] 2.88800209
[Epoch 02] 2.36192711
[Epoch 03] 1.96028529
[Epoch 04] 1.59957976
[Epoch 05] 1.22611987
[Epoch 06] 0.87889902
[Epoch 07] 0.54227505
[Epoch 08] 0.33578738
[Epoch 09] 0.21201504
[Epoch 10] 0.20038928
[Epoch 11] 0.17722930
[Epoch 12] 0.15801338
[Epoch 13] 0.14833670
[Epoch 14] 0.12397254
[Epoch 15] 0.10777072
[Epoch 16] 0.09825277
[Epoch 17] 0.13012789
[Epoch 18] 0.11447810
[Epoch 19] 0.08842344
[Epoch 20] 0.07526941
[Epoch 21] 0.08866941
[Epoch 22] 0.09730977
[Epoch 23] 0.09207045
[Epoch 24] 0.07990088
[Epoch 25] 0.06634067
[Epoch 26] 0.07914605
[Epoch 27] 0.06461058
[Epoch 28] 0.05971114
[Epoch 29] 0.07999803
[Epoch 30] 0.06231711
[Epoch 31] 0.05414208
[Epoch 32] 0.05418844
[Epoch 33] 0.06799812
[Epoch 34] 0.07609604
[Epoch 35] 0.05467035
[Epoch 36] 0.04762390
[Epoch 37] 0.05371990
[Epoch 38] 0.05767364
[Epoch 39] 0.05375689
[Epoch 40] 0.04962738
[Epoch 41] 0.04518812
[Epoch 42] 0.04953536
[Epoch 43] 0.04910351
[Epoch 44] 0.03543905
[Epoch 45]

In [30]:
# Store only the parameters and buffers of the network (its state dict), not
# the whole module object.
torch.save(obj=net.state_dict(), f='resnet_100epochs_20220420.pth')