<a href="https://colab.research.google.com/github/Rajitha-SL/My-AI-Projects/blob/AI-and-ML-learning/Transfer_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Define a ResNet block in PyTorch

In [14]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
  """Basic residual block computing ``relu(F(x) + x)``.

  ``F`` is a mini network made of two 3x3 convolutional layers with a ReLU
  in between. Both convolutions use ``padding=1`` so that the spatial size
  of the feature map is preserved and ``F(x)`` can be added to ``x``.

  Args:
    inp: Number of channels of the input feature map.
    out1: Number of channels produced by the first convolution.
    out2: Number of channels produced by the second convolution. Defaults
      to ``inp``; it has to be equal to ``inp``, otherwise the skip
      connection ``F(x) + x`` is not a plain element-wise sum.

  Raises:
    ValueError: If ``out2`` is given and is different from ``inp``.
  """

  def __init__(self, inp, out1, out2=None):
    super().__init__()

    if out2 is None:
      out2 = inp
    if out2 != inp:
      raise ValueError(
          f"out2 ({out2}) must be equal to inp ({inp}) "
          "so that the input can be added back to F(x)"
      )

    # We are defining a mini network
    # that is made of two standard convolutional layers
    # with the relu in-between
    self.conv_block = nn.Sequential(
        # padding=1 keeps height and width unchanged for a 3x3 kernel
        nn.Conv2d(inp, out1, 3, padding=1),
        nn.ReLU(),
        nn.Conv2d(out1, out2, 3, padding=1),
        # Note that after the second layer
        # there is no activation
    )

    self.relu = nn.ReLU()

  def forward(self, x):
    """Return ``relu(F(x) + x)``; the result has the same shape as ``x``."""
    # F(x)
    residual = self.conv_block(x)
    # Before we apply the second activation
    # we add back the input x
    # This is the implementation of the skip connection: H(x) = F(x) + x
    out = residual + x
    return self.relu(out)


We can see above that if the optimizer drives all the convolutional weights (and biases) to 0, then F will be 0 and H will be equal to x.

Then the block reduces to the identity function (followed only by the final ReLU).

# The following is how we implement a Squeeze-and-Excitation block in PyTorch


In [15]:
import torch
import torch.nn as nn

class SqueezeExcitation(nn.Module):
  """Squeeze-and-Excitation block that rescales the channels of its input.

  The input is averaged over its spatial dimensions (squeeze), the
  per-channel averages go through a small fully-connected network ending in
  a sigmoid (excitation), and the resulting per-channel factors in (0, 1)
  multiply the input feature maps.

  Args:
    input_channels: Number of channels of the input feature map.
    squeeze_channels: Width of the bottleneck between the two linear layers.
  """

  def __init__(self, input_channels, squeeze_channels):
    super().__init__()

    # Squeeze: Global Average Pooling (GAP) down to a 1x1 map per channel
    self.squeeze = nn.AdaptiveAvgPool2d(1)

    # Excitation: two linear layers with a ReLU in between; the final
    # sigmoid keeps every scaling factor between 0 and 1
    gate_layers = [
        nn.Flatten(),
        nn.Linear(input_channels, squeeze_channels),
        nn.ReLU(),
        nn.Linear(squeeze_channels, input_channels),
        nn.Sigmoid(),
    ]
    self.excitation = nn.Sequential(*gate_layers)

  def forward(self, x):
    """Return ``x`` with every channel multiplied by its learned factor."""
    # Squeeze first, then compute one scaling factor per channel
    pooled = self.squeeze(x)
    factors = self.excitation(pooled)
    # Add back two trailing singleton dimensions so that the factors
    # broadcast over the spatial dimensions, boosting or reducing the
    # importance of each input feature map
    return factors[..., None, None] * x


Getting a Pre-Trained Model with Torchvision

In [16]:
import torchvision.models

In [17]:
# Load ResNet18 with its ImageNet pre-trained weights. The `weights=`
# argument replaces the `pretrained=True` flag, which is deprecated since
# torchvision 0.13; IMAGENET1K_V1 are the weights that flag used to load.
model = torchvision.models.resnet18(
    weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
)
# If we pass weights=None instead, we get the model initialized with the
# default initialization, ready to be trained from scratch



In [18]:
# Print the module hierarchy of the model (layer names, types and settings)
print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [19]:
# A frozen parameter is a parameter that is not allowed to vary during
# training. In other words, backpropagation will ignore that parameter: it
# will neither compute the gradient of the loss with respect to it nor
# change its value.

# Freeze every parameter of the model
for param in model.parameters():
  param.requires_grad_(False)

In [20]:
# Freezing can also be restricted to the parameters of a single layer.
# As an example, here that layer is fc

for param in model.fc.parameters():
  param.requires_grad_(False)

In [21]:
# We can also thaw (defrost) parameters that are frozen by setting
# requires_grad = True

In [22]:
# The BatchNorm layer is a special case: it has two parameters (gamma and
# beta), but it also has buffers that are used to accumulate the running
# mean and variance of the data seen during training. If we use
# requires_grad = False, then we are only fixing beta and gamma.
# If we also want to freeze the accumulated statistics, we need to put
# the entire layer in evaluation mode as follows

model.bn1.eval() # Apply this and see whether it improves the performance

# Note that this is different from model.eval(): only this one layer is
# switched to evaluation mode



BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [23]:
# We can invert this operation by putting the BatchNorm layer
# back into training mode (its running statistics are then updated
# again during forward passes)

model.bn1.train()

BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)

In [24]:
# Using print(model) to inspect a network is not recommended
# What is recommended is to export the model and visualize it with Netron

In [25]:
# Export the model to a TorchScript file that can be visualized with Netron
# We don't need to install anything locally
import copy

# Fake image needed for torch.jit.trace
# (adjust the size of the image from 224x224 to what the network
# expects if needed)
random_image = torch.rand((1, 3, 224, 224))

# Trace an eval-mode copy instead of the model itself: tracing runs forward
# passes, and in training mode those would update the BatchNorm running
# statistics of `model` with the statistics of the random image.
scripted = torch.jit.trace(copy.deepcopy(model).eval(), random_image)
torch.jit.save(scripted, 'my_network.pt')

In [26]:
# To freeze only the parameters of the last layer (which is fc),
# loop over that layer's parameters and switch their gradients off

for param in model.fc.parameters():
  param.requires_grad_(False)

# If we have 1,000 images (a small dataset) and a classification task with 10 classes, this is what we can do.

In [30]:
import copy

import torch
import torch.nn as nn
import torchvision.models

# Get a pre-trained model from torchvision, for example ResNet18.
# The `weights=` argument replaces the `pretrained=True` flag, which is
# deprecated since torchvision 0.13; IMAGENET1K_V1 are the weights that
# flag used to load.
model = torchvision.models.resnet18(
    weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1
)

# Let's freeze all the parameters in the pre-trained network
for param in model.parameters():
  param.requires_grad = False

# Through Netron.app we discovered that the last layer is called
# "fc" (for "fully connected").
# Export the network so that it can be inspected there. We trace an
# eval-mode copy: tracing runs forward passes, and in training mode those
# would overwrite the pre-trained BatchNorm running statistics of `model`
# with the statistics of the random image.
random_image = torch.rand((1, 3, 224, 224))
scripted = torch.jit.trace(copy.deepcopy(model).eval(), random_image)
torch.jit.save(scripted, 'my_network_01.pt')

# Let's find out how many input features "fc" has
# (there are 512 input features)
input_features = model.fc.in_features


512


In [31]:
# Number of classes of our classification task
n_classes = 10

# Replace the existing fully-connected last layer with a layer of our own
# (a new layer has all its parameters free to vary)
model.fc = nn.Linear(input_features, n_classes)

# Alternatively, a more complicated head can be used (this might or might
# not lead to improved performances depending on the case).
# This assignment replaces the simple linear head set just above.
hidden_features = input_features * 2
model.fc = nn.Sequential(
    nn.BatchNorm1d(input_features),
    nn.Linear(input_features, hidden_features),
    nn.ReLU(),
    nn.BatchNorm1d(hidden_features),
    nn.Dropout(0.5),
    nn.Linear(hidden_features, n_classes),
)

In [32]:
# Now we can start the training process
# 1. We can start by executing the learning rate finder
# 2. Then train for a few epochs
# 3. Depending on the size of our dataset, we might be able to reach
#     a good performance rather quickly
# 4. Be careful with overfitting and do not overtrain
# 5. If needed add more image augmentations, weight decay and
#     other regularization techniques

# Large dataset, very different from the original dataset: train from scratch

In [33]:
import torch.nn as nn
import torchvision.models

# Get the ResNet18 architecture from torchvision.
# We are going to use the model without the pre-trained weights, so we pass
# weights=None (this replaces the `pretrained=False` flag, which is
# deprecated since torchvision 0.13)
model = torchvision.models.resnet18(weights=None)



# TIMM: a very useful library for fine-tuning, which is an alternative to torchvision

In [35]:
!pip install timm



In [37]:
# With timm, a pre-trained model with a custom head takes a single call
import timm

n_classes = 196
model = timm.create_model(
    "convnext_small",
    pretrained=True,
    num_classes=n_classes,
)

# The library has already built a head for us, so we don't need to build
# one explicitly

Downloading model.safetensors:   0%|          | 0.00/201M [00:00<?, ?B/s]

In [40]:
# Through Netron.app we can see how the model is structured.
# This step is not needed, since the head is already built
# according to our number of classes,
# but I am doing this just for the curiosity :-)
random_image = torch.rand(1, 3, 224, 224)
scripted = torch.jit.trace(model, random_image)
scripted.save('my_network_02.pt')

# This is quite a large model