# In this part, we will modify our CNN architecture (ResNet, VGG, etc.) so that it fits the TSN architecture.

Architectures usually have some constraints (for example, the input image should be 224*224), so we have to construct our own models by modifying the main CNN architecture. Mainly, we will modify two parts: the train part, which controls how the weights are trained, and the RGB Difference model, which builds more input channels for the concatenated frames.

First, let's import some important libraries.

In [9]:
import torchvision
from torch.nn.init import normal, constant
from torch import nn
import numpy as np
import torch
#from ops.basic_ops import ConsensusModule

Now, we will build our TSN class, which has almost everything needed to get started. We will prepare our base model (VGG, ResNet, etc.) and replace its last layer so that it outputs our number of action classes (e.g. 101 for the UCF101 dataset).

In [2]:
class TSN_model(nn.Module):
    """TSN wrapper around a pretrained torchvision backbone.

    The constructor stores the configuration, loads the backbone through
    ``prepare_model`` and swaps the backbone's last layer for one sized to
    ``num_classes``.

    Args:
        num_classes: Output size of the new fully connected layer.
        num_segments: Number of segments; stored on the instance.
        modality: ``"RGB"`` or ``"RGBDiff"``. ``"RGBDiff"`` extends the
            normalisation statistics and triggers the first-layer conversion.
        consensus_type: Name of the consensus function; stored on the
            instance. Must be ``'avg'`` when ``before_softmax`` is False.
        base_model_name: Name of a constructor in ``torchvision.models``.
            Only names containing ``'resnet'`` are accepted for now.
        new_length: Frames picked from each segment. ``None`` selects 1 for
            ``"RGB"`` and 5 for every other modality.
        before_softmax: When False, an ``nn.Softmax`` layer is created.
        dropout: Dropout probability. With 0 the backbone's last layer is
            replaced by the new ``nn.Linear``; otherwise it is replaced by
            ``nn.Dropout`` and the ``nn.Linear`` is kept in ``self.new_fc``.
        crop_num: Stored on the instance.
        partial_bn: Stored on the instance.

    Raises:
        ValueError: If ``before_softmax`` is False with a consensus other than
            ``'avg'``, or if ``base_model_name`` is not supported.
    """

    def __init__(self, num_classes, num_segments, modality, consensus_type='avg', base_model_name='resnet18',
                 new_length=None, before_softmax=True, dropout=0.8, crop_num=1, partial_bn=True):
        # Run nn.Module's own initialisation first so sub-modules can be registered.
        super(TSN_model, self).__init__()

        self.num_classes = num_classes
        self.num_segments = num_segments
        self.modality = modality
        self.base_model_name = base_model_name
        self.consensus_type = consensus_type
        self.before_softmax = before_softmax
        self.dropout = dropout
        self.crop_num = crop_num
        self.reshape = True
        self.partial_bn = partial_bn

        if not before_softmax and consensus_type != 'avg':
            raise ValueError("Only avg consensus can be used after Softmax")

        # Number of frames picked from each segment.
        if new_length is None:
            self.new_length = 1 if modality == "RGB" else 5
        else:
            self.new_length = new_length

        self.prepare_model(base_model_name, self.num_classes)
        # TODO: create the consensus layer once ops.basic_ops is importable:
        #       self.consensus = ConsensusModule(consensus_type)

        print(("""
                Initializing TSN with base model: {}.
                TSN Configurations:
                    input_modality:     {}
                    num_segments:       {}
                    new_length:         {}
                    consensus_module:   {}
                    dropout_ratio:      {}
               """.format(base_model_name, self.modality, self.num_segments, self.new_length, self.consensus_type, self.dropout)))

        if self.modality == 'RGBDiff':
            print("Converting the ImageNet model to RGBDiff model")
            # NOTE(review): construct_diff_model is not defined in this class as
            # shown here; it has to be provided as a method before an RGBDiff
            # model can be built - confirm where it comes from.
            self.base_model = self.construct_diff_model(self.base_model)
            print("Done. RGBDiff model is ready.")

        # Create the softmax layer only when it is needed.
        if not self.before_softmax:
            # dim is given explicitly because the implicit choice is deprecated.
            # Assumes the scores reaching this layer are (samples, classes) - TODO confirm
            # once forward() exists; for such 2-D input dim=1 is what the implicit rule picked.
            self.softmax = nn.Softmax(dim=1)

    def prepare_model(self, base_model_name, num_classes):
        """Load the backbone and adapt its last layer to ``num_classes`` outputs.

        Sets ``base_model``, ``last_layer_name``, ``input_size``, ``input_mean``,
        ``input_std`` and ``new_fc`` on the instance.

        Args:
            base_model_name: Name of a constructor in ``torchvision.models``;
                must contain ``'resnet'``.
            num_classes: Output size of the new fully connected layer.

        Raises:
            ValueError: If ``base_model_name`` does not contain ``'resnet'``.
        """
        # TODO: add other architectures later.
        if 'resnet' in base_model_name:
            # Load the pretrained backbone.
            self.base_model = getattr(torchvision.models, base_model_name)(pretrained=True)
            self.last_layer_name = 'fc'
            self.input_size = 224
            # Per-channel mean / standard deviation used for input normalisation.
            self.input_mean = [0.485, 0.456, 0.406]
            self.input_std = [0.229, 0.224, 0.225]

            # There is no point in subtracting means from RGBDiff frames, so the
            # extra difference channels get mean 0 and twice the average std.
            if self.modality == 'RGBDiff':
                self.input_mean = self.input_mean + [0] * 3 * self.new_length
                self.input_std = self.input_std + [np.mean(self.input_std) * 2] * 3 * self.new_length
        else:
            raise ValueError('Unknown base model: {}'.format(base_model_name))

        features_dim = getattr(self.base_model, self.last_layer_name).in_features

        if self.dropout == 0:
            # No dropout: the classifier directly replaces the backbone's last layer.
            setattr(self.base_model, self.last_layer_name, nn.Linear(features_dim, num_classes))
            self.new_fc = getattr(self.base_model, self.last_layer_name)
        else:
            # With dropout: the backbone ends in Dropout and the classifier is a separate layer.
            setattr(self.base_model, self.last_layer_name, nn.Dropout(self.dropout))
            self.new_fc = nn.Linear(features_dim, num_classes)

        print(self.new_fc)

        # Initialise the new classifier: weights drawn from N(0, std^2), bias set to 0.
        # The underscore variants replace the deprecated nn.init.normal / nn.init.constant.
        std = 0.001
        nn.init.normal_(self.new_fc.weight, 0, std)
        nn.init.constant_(self.new_fc.bias, 0)

        print(self.new_fc.bias)

Now, let's try creating an object and manipulate it to make sure it is working.

In [3]:
# Build a model for 101 action classes and 3 segments in RGBDiff mode;
# base_model_name is left at its default ('resnet18').
obj = TSN_model(num_classes=101, num_segments=3, modality='RGBDiff')

Linear(in_features=512, out_features=101, bias=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)

                Initializing TSN with base model: resnet18.
                TSN Configurations:
                    input_modality:     RGBDiff
                    num_segments:       3
                    new_length:         5
                    consensus_module:   avg
                    dropout_ratio:      0.8

               




In [4]:
# Display the backbone held by the model so its layers can be inspected.
obj.base_model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace)
      (conv2): Co

The next thing to be modified is the train section. We should freeze every Batch Normalization layer except the first one for smoother training.
For any more details, please refer to this paper: https://arxiv.org/pdf/1502.03167.pdf

In [6]:
def train(self, mode=True):
    """Switch the module to training/evaluation mode with partial batch-norm freezing.

    After the regular mode switch, and only when ``self.partial_bn`` is true,
    every ``nn.BatchNorm2d`` in ``self.base_model`` except the first one is put
    in eval mode and its ``weight`` and ``bias`` stop requiring gradients. The
    freezing is applied whatever the value of ``mode``.

    Args:
        mode: True for training mode, False for evaluation mode.

    Returns:
        ``self``, like ``nn.Module.train``.
    """
    # This def lives at module level, so zero-argument super() is unavailable;
    # call the base implementation explicitly.
    nn.Module.train(self, mode)

    if not self.partial_bn:
        return self

    bn_layers = (m for m in self.base_model.modules() if isinstance(m, nn.BatchNorm2d))
    next(bn_layers, None)  # the first batch-norm layer stays trainable

    for bn in bn_layers:
        bn.eval()  # stop updating the running statistics
        bn.weight.requires_grad = False
        bn.bias.requires_grad = False

    return self

Let's define a function that subtracts consecutive frames to obtain the input for our RGB difference model. The number of frames for each segment is usually set to 5 (you can change it, but this gave the best accuracy in the paper).

In [7]:
 def extract_rgbDiff(self, RGB_tensor, keep_rgb=False):
    """
    Turn the frames picked from a video into frame-to-frame differences.

    RGB_tensor : Tensor holding the picked frames; it is viewed as
                 (-1, num_segments, new_length + 1, 3, <remaining dims from index 2 on>),
                 so each segment must contribute new_length + 1 frames.
    keep_rgb   : True  -> the first frame of every segment is left as RGB and
                          the following ones become differences
                          [RGB, RGBDiff, RGBDiff, ...]
                 False -> only the new_length difference frames are returned

    Returns a new tensor; RGB_tensor itself is not modified.
    """
    trailing_dims = tuple(RGB_tensor.size()[2:])
    clips = RGB_tensor.view((-1, self.num_segments, self.new_length + 1, 3) + trailing_dims)

    # Each frame minus its predecessor, for all frames of a segment at once.
    frame_deltas = clips[:, :, 1:] - clips[:, :, :-1]

    if not keep_rgb:
        return frame_deltas

    # Keep frame 0 of every segment untouched and overwrite the rest with the differences.
    mixed = clips.clone()
    mixed[:, :, 1:] = frame_deltas
    return mixed

Everything is fine so far. The last thing to do is to replace the first Conv2d layer with one whose number of input channels matches the number of frames that each segment feeds through the CNN.

In [10]:
def modify_rgbDiff(self, base_model, keep_rgb=True):
    """Replace the first ``nn.Conv2d`` of ``base_model`` with a wider one for RGBDiff input.

    The kernels of the original layer are averaged over their input channels
    and that average is repeated for ``3 * self.new_length`` channels. With
    ``keep_rgb`` the original kernels are kept in front of the repeated ones.
    Stride, padding, dilation, kernel size, output channels and the bias (if
    any) are taken over from the original layer.

    Args:
        base_model: Module whose first convolution is replaced in place.
        keep_rgb: True to keep the original input channels and append the
            difference channels; False to use difference channels only.

    Returns:
        ``base_model``, with its first convolution replaced.

    Raises:
        ValueError: If ``base_model`` contains no ``nn.Conv2d``, or if it is
            itself the convolution (there is no parent to hold the new layer).
    """
    # Walk the model that was passed in (not self.base_model), so the module
    # that is modified is always the one that is returned.
    first_conv_name, conv_layer = next(
        ((name, module) for name, module in base_model.named_modules()
         if isinstance(module, nn.Conv2d)),
        (None, None),
    )
    if conv_layer is None:
        raise ValueError("base_model contains no nn.Conv2d layer to modify")
    if not first_conv_name:
        raise ValueError("base_model is itself an nn.Conv2d; it has no parent module to hold the new layer")

    # Resolve the module that owns the convolution from its dotted name,
    # e.g. 'conv1' -> base_model, 'features.0' -> base_model.features.
    parent_path, _, layer_name = first_conv_name.rpartition('.')
    container = base_model
    for part in filter(None, parent_path.split('.')):
        container = getattr(container, part)

    weight = conv_layer.weight.detach()
    diff_channels = 3 * self.new_length
    # Average over the input channels, then repeat the average for every difference channel.
    diff_kernels = weight.mean(dim=1, keepdim=True).expand(-1, diff_channels, -1, -1).contiguous()
    if keep_rgb:
        new_kernels = torch.cat((weight, diff_kernels), dim=1)
    else:
        new_kernels = diff_kernels

    has_bias = conv_layer.bias is not None
    new_conv = nn.Conv2d(new_kernels.size(1), conv_layer.out_channels, conv_layer.kernel_size,
                         conv_layer.stride, conv_layer.padding, dilation=conv_layer.dilation,
                         bias=has_bias)
    new_conv.weight.data = new_kernels
    if has_bias:
        new_conv.bias.data = conv_layer.bias.detach().clone()

    # Swap the new layer in under the same attribute name.
    setattr(container, layer_name, new_conv)
    return base_model