# Single-shot Detection with sgrvinod and FastAI

This is an interactive notebook where you can change things around, experiment, and single out techniques anywhere in the process.

## The model components

This guide assumes a solid understanding of convolutional concepts and computation. If this assumption does not hold, please feel free to check my other [repos](https://github.com/Sylar257/Image-Captioning-Project) (and [this](https://github.com/Sylar257/Skin-cancer-detection-with-stacking)), which contain more content on these subjects.
Hence, in this notebook, we are going to jump right into our model and explain along the way why we need all these components.<br>
This might also be the place where you can experiment with most of your modifications to the architecture: using more powerful base models, adding regularization layers, etc.

In [4]:
from torch import nn
from utils import *
import torch.nn.functional as F
from math import sqrt
from itertools import product as product
import torchvision

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Device type strings must be lowercase: torch.device("CPU") raises a RuntimeError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
class VGGBase(nn.Module):
    """
    VGG-16 style convolutional base used for low-level feature extraction.

    The fully connected layers fc6 and fc7 of VGG-16 are replaced by the
    convolutional layers conv6 (dilated) and conv7. Constructing the module
    calls ``load_pretrained_layers``, which fetches the torchvision VGG-16
    weights and copies them into this network.

    The size annotations in the comments assume a 300 by 300 input image.
    """

    def __init__(self):
        super(VGGBase, self).__init__()

        # Standard convolutional layers in VGG16
        # We have an input size of 300 by 300
        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, padding=1)   # stride = 1, output = (300+2-3)/1+1 = 300
        self.conv1_2 = nn.Conv2d(64,64, kernel_size=3, padding=1)   # output = 300 as before
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)          # output = (300-2)/2+1 = 150

        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1) # output = (150+2-3)/1+1 = 150
        self.conv2_2 = nn.Conv2d(128,128, kernel_size=3, padding=1) # output = (150+2-3)/1+1 = 150
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)          # output = (150-2)/2 +1  = 75

        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, padding=1)# output = (75+2-3)/1+1 = 75
        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1)# output = (75+2-3)/1+1 = 75
        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1)# output = (75+2-3)/1+1 = 75
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True) # ceiling (not floor) here for even dims
        # output = ceil((75-2)/2)+1 = 38   with floor we would get 37 here, which is an odd number

        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, padding=1)# output = (38+2-3)/1+1 = 38
        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)# output = (38+2-3)/1+1 = 38
        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)# output = (38+2-3)/1+1 = 38
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)          # output = (38-2)/2 +1  = 19

        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, padding=1)# output = (19+2-3)/1+1 = 19
        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, padding=1)# output = (19+2-3)/1+1 = 19
        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, padding=1)# output = (19+2-3)/1+1 = 19
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) # We retain the size at this step with padding and stride of 1
        # output = (19+2-3)/1+1 = 19

        # Here we replace FC6 and FC7 with the technique introduced by sgrvinod (same as the original paper)
        self.conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) # output = (19+12-3-2*(6-1))/1+1 = 19

        self.conv7 = nn.Conv2d(1024,1024, kernel_size=1)                        # output = (19-1)/1+1 = 19

        # Load pretrained layers
        self.load_pretrained_layers()

    def forward(self, image):
        """
        Forward run with an image input of size 300 by 300.

        image: tensor of shape (N, 3, 300, 300)
        return: lower-level feature maps conv4_3 and conv7
        """
        out = F.relu(self.conv1_1(image))  # (N,64,300,300)
        out = F.relu(self.conv1_2(out))    # (N,64,300,300)
        out = self.pool1(out)              # (N,64,150,150)

        out = F.relu(self.conv2_1(out))    # (N,128,150,150)
        out = F.relu(self.conv2_2(out))    # (N,128,150,150)
        out = self.pool2(out)              # (N,128, 75, 75)

        out = F.relu(self.conv3_1(out))    # (N,256, 75, 75)
        out = F.relu(self.conv3_2(out))    # (N,256, 75, 75)
        out = F.relu(self.conv3_3(out))    # (N,256, 75, 75)
        out = self.pool3(out)              # (N,256, 38, 38) because we have ceil_mode=True

        out = F.relu(self.conv4_1(out))    # (N,512, 38, 38)
        out = F.relu(self.conv4_2(out))    # (N,512, 38, 38)
        out = F.relu(self.conv4_3(out))    # (N,512, 38, 38)
        # here we extract the feature from conv4_3
        conv4_3_feats = out
        out = self.pool4(out)              # (N,512, 19, 19)

        out = F.relu(self.conv5_1(out))    # (N,512, 19, 19)
        out = F.relu(self.conv5_2(out))    # (N,512, 19, 19)
        out = F.relu(self.conv5_3(out))    # (N,512, 19, 19)
        out = self.pool5(out)              # (N,512, 19, 19) k=3,s=1,p=1, we are retaining the size here

        out = F.relu(self.conv6(out))      # (N,1024,19, 19) This is the dilated convolutional layer with dilation=6, padding=6

        out = F.relu(self.conv7(out))      # (N,1024,19, 19)
        # also, extract feature maps of conv7 here
        conv7_feats = out

        return conv4_3_feats, conv7_feats

    def load_pretrained_layers(self):
        """
        Copy pre-trained torchvision VGG-16 weights into this network.

        The conv1_1 .. conv5_3 parameters are copied position by position.
        The fc6 and fc7 weights are reshaped into convolution kernels and
        subsampled with ``decimate`` (provided by utils) to fill conv6 and conv7.
        """
        # current state of base architecture
        state_dict = self.state_dict()
        param_names = list(state_dict.keys())

        # Pretrained VGG base
        pretrained_state_dict = torchvision.models.vgg16(pretrained=True).state_dict()
        pretrained_param_names = list(pretrained_state_dict.keys())

        # Transfer conv parameters from the pretrained model to the current model.
        # The last four entries are conv6/conv7 weight and bias, handled separately below;
        # the remaining names are matched to the pretrained names by position.
        for i, param in enumerate(param_names[:-4]):  # excluding conv6 and conv7 parameters
            state_dict[param] = pretrained_state_dict[pretrained_param_names[i]]

        # Convert fc6, fc7 to convolutional layers, and subsample (by decimation) to sizes of conv6 and conv7
        # fc6
        conv_fc6_weight = pretrained_state_dict['classifier.0.weight'].view(4096, 512, 7, 7)  # (4096, 512, 7, 7)
        conv_fc6_bias = pretrained_state_dict['classifier.0.bias']  # (4096)
        state_dict['conv6.weight'] = decimate(conv_fc6_weight, m=[4, None, 3, 3])  # expected (1024, 512, 3, 3) to match conv6.weight
        state_dict['conv6.bias'] = decimate(conv_fc6_bias, m=[4])  # expected (1024)
        # fc7
        conv_fc7_weight = pretrained_state_dict['classifier.3.weight'].view(4096, 4096, 1, 1)  # (4096, 4096, 1, 1)
        conv_fc7_bias = pretrained_state_dict['classifier.3.bias']  # (4096)
        state_dict['conv7.weight'] = decimate(conv_fc7_weight, m=[4, 4, None, None])  # expected (1024, 1024, 1, 1) to match conv7.weight
        state_dict['conv7.bias'] = decimate(conv_fc7_bias, m=[4])  # expected (1024)

        # load_state_dict checks that every tensor matches the shape of the parameter it fills
        self.load_state_dict(state_dict)

        print("\nLoaded base model with pre-trained weights\n")

In [10]:
class AuxiliaryConvolutions(nn.Module):
    """
    Extra convolution stages applied to the conv7 feature map.

    Each stage is a 1x1 convolution followed by a 3x3 convolution; the output
    of every 3x3 convolution is returned as a progressively smaller feature map
    for object detection.
    """

    def __init__(self):
        super(AuxiliaryConvolutions, self).__init__()

        # (attribute name, in_channels, out_channels, Conv2d keyword arguments)
        # Layers are created in this exact order so that module registration,
        # state_dict keys and random initialisation stay in the same sequence.
        layer_table = (
            ('conv8_1', 1024, 256, dict(kernel_size=1, padding=0)),            # 19 -> (19-1)/1+1 = 19
            ('conv8_2', 256, 512, dict(kernel_size=3, stride=2, padding=1)),   # 19 -> (19+2-3)/2+1 = 10
            ('conv9_1', 512, 128, dict(kernel_size=1, padding=0)),             # 10 -> 10
            ('conv9_2', 128, 256, dict(kernel_size=3, stride=2, padding=1)),   # 10 -> floor((10+2-3)/2)+1 = 5
            ('conv10_1', 256, 128, dict(kernel_size=1, padding=0)),            # 5 -> 5
            ('conv10_2', 128, 256, dict(kernel_size=3, padding=0)),            # 5 -> (5-3)/1+1 = 3
            ('conv11_1', 256, 128, dict(kernel_size=1, padding=0)),            # 3 -> 3
            ('conv11_2', 128, 256, dict(kernel_size=3, padding=0)),            # 3 -> (3-3)/1+1 = 1
        )
        for attr_name, c_in, c_out, conv_kwargs in layer_table:
            setattr(self, attr_name, nn.Conv2d(c_in, c_out, **conv_kwargs))

        self.init_conv2d()

    def init_conv2d(self):
        """
        Re-initialise every direct Conv2d child: Kaiming-uniform weights, zero biases.
        """
        conv_layers = (layer for layer in self.children() if isinstance(layer, nn.Conv2d))
        for layer in conv_layers:
            nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
            nn.init.constant_(layer.bias, 0.)

    def forward(self, conv7_feats):
        """
        conv7_feats: (N, 1024, 19, 19)
        return: higher-level feature maps conv8_2, conv9_2, conv10_2, and conv11_2
        """
        stages = (
            (self.conv8_1, self.conv8_2),    # -> (N, 512, 10, 10)
            (self.conv9_1, self.conv9_2),    # -> (N, 256, 5, 5)
            (self.conv10_1, self.conv10_2),  # -> (N, 256, 3, 3)
            (self.conv11_1, self.conv11_2),  # -> (N, 256, 1, 1)
        )

        collected = []
        x = conv7_feats
        for squeeze, expand in stages:
            x = F.relu(squeeze(x))  # 1x1 convolution, spatial size unchanged
            x = F.relu(expand(x))   # 3x3 convolution, spatial size shrinks
            collected.append(x)

        conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats = collected

        # Higher-level feature maps
        return conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats
    

In [11]:
class PredictionConvolutions(nn.Module):
    """
    Convolutional heads that predict box offsets and class scores from the
    lower- and higher-level feature maps.

    Box predictions are offsets (g_{c_x}, g_{c_y}, g_w, g_h) relative to the
    default priors; see 'cxcy_to_gcxgcy' in utils.py for the encoding definition.

    Class predictions are one score per object class for every prior.
    A high score for 'background' = no object.

    With the feature-map sizes documented in ``forward`` there are
    5776 + 2166 + 600 + 150 + 36 + 4 = 8732 priors in total.
    """

    def __init__(self, n_classes):
        super(PredictionConvolutions, self).__init__()

        self.n_classes = n_classes

        # (feature-map name, input channels, number of prior boxes per position)
        head_specs = (
            ('conv4_3', 512, 4),
            ('conv7', 1024, 6),
            ('conv8_2', 512, 6),
            ('conv9_2', 256, 6),
            ('conv10_2', 256, 4),
            ('conv11_2', 256, 4),
        )

        # LOCALIZATION heads: 4 offsets per prior. kernel_size=3 with padding=1
        # keeps the spatial size of the incoming feature map.
        for fmap, channels, priors in head_specs:
            setattr(self, 'loc_' + fmap,
                    nn.Conv2d(channels, priors * 4, kernel_size=3, padding=1))

        # CLASS heads: n_classes scores per prior. Created after all localization
        # heads so the module registration order is loc_* first, then cl_*.
        for fmap, channels, priors in head_specs:
            setattr(self, 'cl_' + fmap,
                    nn.Conv2d(channels, priors * n_classes, kernel_size=3, padding=1))

        self.init_conv2d()

    def init_conv2d(self):
        """
        Re-initialise every direct Conv2d child with kaiming_uniform_ weights
        (used here instead of xavier_uniform_) and zero biases.
        """
        for head in filter(lambda module: isinstance(module, nn.Conv2d), self.children()):
            nn.init.kaiming_uniform_(head.weight)
            nn.init.constant_(head.bias, 0.)

    def forward(self, conv4_3_feats, conv7_feats, conv8_2_feats, conv9_2_feats, conv10_2_feats, conv11_2_feats):
        """
        Forward propagation.
        :param conv4_3_feats: conv4_3 feature map, a tensor of dimensions (N, 512, 38, 38)
        :param conv7_feats: conv7 feature map, a tensor of dimensions (N, 1024, 19, 19)
        :param conv8_2_feats: conv8_2 feature map, a tensor of dimensions (N, 512, 10, 10)
        :param conv9_2_feats: conv9_2 feature map, a tensor of dimensions (N, 256, 5, 5)
        :param conv10_2_feats: conv10_2 feature map, a tensor of dimensions (N, 256, 3, 3)
        :param conv11_2_feats: conv11_2 feature map, a tensor of dimensions (N, 256, 1, 1)
        :return: locations (N, 8732, 4) and class scores (N, 8732, n_classes), i.e. one row per prior box
        """
        batch_size = conv4_3_feats.size(0)

        # Feature maps in the order their predictions are concatenated.
        # Priors contributed per map: 5776, 2166, 600, 150, 36, 4.
        named_feats = (
            ('conv4_3', conv4_3_feats),
            ('conv7', conv7_feats),
            ('conv8_2', conv8_2_feats),
            ('conv9_2', conv9_2_feats),
            ('conv10_2', conv10_2_feats),
            ('conv11_2', conv11_2_feats),
        )

        def per_prior(head, feats, width):
            # (N, priors * width, H, W) -> (N, H, W, priors * width) so that, after
            # .view(), rows follow the prior-box order. .contiguous() places the
            # permuted tensor in one memory chunk, which .view() requires.
            raw = head(feats)
            channels_last = raw.permute(0, 2, 3, 1).contiguous()
            return channels_last.view(batch_size, -1, width)  # (N, H * W * priors, width)

        # Predict localization boxes' bounds w.r.t. prior boxes
        loc_parts = [per_prior(getattr(self, 'loc_' + fmap), feats, 4)
                     for fmap, feats in named_feats]

        # Predict classes in localization boxes
        cls_parts = [per_prior(getattr(self, 'cl_' + fmap), feats, self.n_classes)
                     for fmap, feats in named_feats]

        # A total of 8732 boxes, concatenated in this specific order
        locs = torch.cat(loc_parts, dim=1)            # (N, 8732, 4)
        classes_scores = torch.cat(cls_parts, dim=1)  # (N, 8732, n_classes)

        return locs, classes_scores