# **Deep learning for image analysis with Python**

#### Fernando Cervantes, Systems Analyst I, Imaging Solutions, Research IT
#### fernando.cervantes@jax.org    (slack) @fernando.cervantes

## 6 Monitoring and logging the training process

It is important to track the training process. By doing that, we can detect interesting behavior of our network, possible failures, and even *overfitting*.<br>
This also helps to save the results of different experiments performed using distinct configurations.

### 6.1 _Logging the network performance_

In [None]:
from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor

cifar_data = CIFAR100(root=r'C:\Users\cervaf\Documents\Datasets',
                             download=False,
                             train=True,
                             transform=ToTensor()
                            )

cifar_loader = DataLoader(cifar_data,
                              batch_size=128,
                              shuffle=True,
                              pin_memory=True
                             )

In [1]:
import torch
import torch.nn as nn


class LeNet(nn.Module):
    def __init__(self, in_channels=1, num_classes=10):
        """
        Always call the initialization function from the nn.Module parent class.
        This way all parameters from the operations defined as members of *this* class are tracked for their optimization.
        """
        super(LeNet, self).__init__()
        
        self.conv_1 = nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5)
        self.sub_1 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.conv_2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.sub_2 = nn.MaxPool2d(kernel_size=2, stride=2)
        
        self.fc_1 = nn.Linear(in_features=5*5*16, out_features=120)
        self.fc_2 = nn.Linear(in_features=120, out_features=84)
        self.fc_3 = nn.Linear(in_features=84, out_features=num_classes)
        
        self.act_fn = nn.ReLU()

    def forward(self, x):
        # Apply convolution layers to extract feature maps with image context
        fx = self.act_fn(self.conv_1(x))
        fx = self.sub_1(fx)
        
        fx = self.act_fn(self.conv_2(fx))
        fx = self.sub_2(fx)
        
        # Flatten the feature maps to perform linear operations
        fx = fx.view(-1, 16*5*5)
        
        fx = self.act_fn(self.fc_1(fx))
        fx = self.act_fn(self.fc_2(fx))
        y = self.fc_3(fx)
        
        return y

net = LeNet(in_channels=3, num_classes=100)

criterion = nn.CrossEntropyLoss()

net.cuda()
criterion.cuda()

AssertionError: Torch not compiled with CUDA enabled

In [2]:
import torch.optim as optim

optimizer = optim.Adam(
    params=net.parameters(),
    lr=1e-3
)

***
Now that we have set up our experiment, lets create a summary writer for our training stage

In [26]:
from torch.utils.tensorboard import SummaryWriter

Create a summary writter using TensorBoard

In [27]:
writer = SummaryWriter('runs/LR_0.001_BATCH_128')

In [28]:
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)

In [29]:
trn_loss = []
net.train()

for e  in range(100):
    avg_loss = 0
    for i, (x, t) in enumerate(cifar_loader):
        starter.record()
        optimizer.zero_grad()

        x = x.cuda()
        t = t.cuda()
        
        y = net(x)

        loss = criterion(y, t)

        loss.backward()
        
        avg_loss += loss.item()
        trn_loss.append(loss.item())

        optimizer.step()
        ender.record()

        torch.cuda.synchronize()

        e_time = starter.elapsed_time(ender)
        
        writer.add_scalar('batch time', e_time, e * len(cifar_loader) + i)
        writer.add_scalar('training loss', loss.item(), e * len(cifar_loader) + i)

    avg_loss = avg_loss / len(cifar_trn_loader)
    print('[Epoch %02i] %.8f' % (e, avg_loss))
    

[Epoch 00] 3.65062169
[Epoch 01] 2.88800209
[Epoch 02] 2.36192711
[Epoch 03] 1.96028529
[Epoch 04] 1.59957976
[Epoch 05] 1.22611987
[Epoch 06] 0.87889902
[Epoch 07] 0.54227505
[Epoch 08] 0.33578738
[Epoch 09] 0.21201504
[Epoch 10] 0.20038928
[Epoch 11] 0.17722930
[Epoch 12] 0.15801338
[Epoch 13] 0.14833670
[Epoch 14] 0.12397254
[Epoch 15] 0.10777072
[Epoch 16] 0.09825277
[Epoch 17] 0.13012789
[Epoch 18] 0.11447810
[Epoch 19] 0.08842344
[Epoch 20] 0.07526941
[Epoch 21] 0.08866941
[Epoch 22] 0.09730977
[Epoch 23] 0.09207045
[Epoch 24] 0.07990088
[Epoch 25] 0.06634067
[Epoch 26] 0.07914605
[Epoch 27] 0.06461058
[Epoch 28] 0.05971114
[Epoch 29] 0.07999803
[Epoch 30] 0.06231711
[Epoch 31] 0.05414208
[Epoch 32] 0.05418844
[Epoch 33] 0.06799812
[Epoch 34] 0.07609604
[Epoch 35] 0.05467035
[Epoch 36] 0.04762390
[Epoch 37] 0.05371990
[Epoch 38] 0.05767364
[Epoch 39] 0.05375689
[Epoch 40] 0.04962738
[Epoch 41] 0.04518812
[Epoch 42] 0.04953536
[Epoch 43] 0.04910351
[Epoch 44] 0.03543905
[Epoch 45]

In [30]:
torch.save(net.state_dict(), 'resnet_100epochs_20220420.pth')