<a href="https://colab.research.google.com/github/RooshOfficial/Yolact_R/blob/main/yolact_cis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# Backbone stage indices, presumably the stages whose outputs feed the later heads.
# NOTE(review): nothing visible in this notebook reads this name --
# construct_backbone() hard-codes its own copy of the same list; confirm whether
# the two are meant to stay in sync.
backbone_selected_layers=[1, 2, 3]

class GhostModule(nn.Conv2d):
    """Ghost convolution: a primary conv followed by a cheap depthwise conv.

    ``conv1`` produces ``init_channels = ceil(out_channels / ratio)`` feature
    maps; ``conv2`` (depthwise, ``groups=init_channels``) derives
    ``init_channels * (ratio - 1)`` extra maps from them. ``forward`` returns
    the two concatenated along the channel axis, trimmed to ``out_channels``.

    Args:
        in_channels: channels of the input tensor.
        out_channels: channels of the returned tensor.
        kernel_size: kernel size of the primary conv (int or tuple).
        dw_size: kernel size of the depthwise conv.
        ratio: how many output maps each primary map accounts for.
        stride: stride of the primary conv.
        padding: padding of the primary conv.
        dilation, groups: stored by ``nn.Conv2d`` but NOT forwarded to
            ``conv1`` / ``conv2``.
        bias: only controls the module-level ``self.bias`` parameter;
            ``conv1`` / ``conv2`` always keep their own default bias.

    NOTE(review): ``weight1``, ``weight2``, ``bn1``, ``bn2`` and ``bias`` are
    registered on the module but ``forward`` never reads them. They are kept so
    that the state-dict layout does not change.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dw_size=3, ratio=2, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        super(GhostModule, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
        # The dense kernel allocated by nn.Conv2d is not used by this module.
        self.weight = None
        self.ratio = ratio
        self.dw_size = dw_size
        self.dw_dilation = (dw_size - 1) // 2
        self.init_channels = math.ceil(out_channels / ratio)
        self.new_channels = self.init_channels * (ratio - 1)

        # self.kernel_size / self.stride / self.padding are the tuple-normalised
        # values stored by nn.Conv2d, so int and tuple arguments both work.
        self.conv1 = nn.Conv2d(self.in_channels, self.init_channels, self.kernel_size,
                               self.stride, padding=self.padding)
        self.conv2 = nn.Conv2d(self.init_channels, self.new_channels, self.dw_size, 1,
                               padding=int(self.dw_size/2), groups=self.init_channels)

        # Shaped from the normalised kernel size: the raw argument may be a
        # tuple, which torch.Tensor(...) cannot take as a single dimension.
        self.weight1 = nn.Parameter(
            torch.Tensor(self.init_channels, self.in_channels, *self.kernel_size))
        self.bn1 = nn.BatchNorm2d(self.init_channels)
        if self.new_channels > 0:
            self.weight2 = nn.Parameter(torch.Tensor(self.new_channels, 1, self.dw_size, self.dw_size))
            self.bn2 = nn.BatchNorm2d(self.out_channels - self.init_channels)

        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_custome_parameters()

    def reset_custome_parameters(self):
        """Initialise the extra parameters (Kaiming-uniform weights, zero bias)."""
        nn.init.kaiming_uniform_(self.weight1, a=math.sqrt(5))
        if self.new_channels > 0:
            nn.init.kaiming_uniform_(self.weight2, a=math.sqrt(5))
        if self.bias is not None:
            nn.init.constant_(self.bias, 0)

    def forward(self, input):
        """Return ``cat([conv1(input), conv2(conv1(input))])`` trimmed to ``out_channels``."""
        x1 = self.conv1(input)
        if self.new_channels == 0:
            # ratio == 1: there are no ghost maps to append.
            return x1
        x2 = self.conv2(x1)
        # init_channels * ratio may overshoot out_channels; drop the surplus maps.
        x2 = x2[:, :self.out_channels - self.init_channels, :, :]
        x = torch.cat([x1, x2], 1)
        return x


def conv3x3(in_planes, out_planes, stride=1, s=4, d=3):
    """Build a bias-free 3x3 GhostModule with padding 1.

    ``s`` is passed on as the ghost ``ratio`` and ``d`` as the depthwise
    kernel size (``dw_size``).
    """
    ghost_conv = GhostModule(
        in_planes,
        out_planes,
        kernel_size=3,
        dw_size=d,
        ratio=s,
        stride=stride,
        padding=1,
        bias=False,
    )
    return ghost_conv

In [4]:
class GhostBottleneck(nn.Module):
    """ResNet-style bottleneck whose three convolutions are GhostModules.

    Layout: 1x1 ghost conv -> 3x3 ghost conv (carries ``stride``) -> 1x1 ghost
    conv widening to ``planes * expansion``, each followed by a norm layer,
    with a residual connection around the whole block.

    Args:
        inplanes: channels of the input tensor.
        planes: bottleneck width; the block outputs ``planes * expansion`` channels.
        stride: stride applied by the 3x3 conv.
        downsample: optional module applied to the input before the residual
            add; required whenever the input and output shapes differ.
        norm_layer: callable ``channels -> nn.Module`` used after each conv.
    """

    # Output-width multiplier; GhostResNet.make_layer reads it to size the
    # next block's input and the downsample path.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, norm_layer=nn.BatchNorm2d):
        super().__init__()
        # Use the class attribute rather than a literal 4 so the block's real
        # output width always matches what make_layer assumes.
        out_planes = planes * self.expansion
        self.conv1 = GhostModule(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(planes)
        self.conv2 = GhostModule(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = norm_layer(planes)
        self.conv3 = GhostModule(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Apply the three conv/norm stages and add the (optionally projected) input."""
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # Project the shortcut when the main path changed resolution or width.
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

In [5]:
class GhostResNet(nn.Module):
    """ResNet-style backbone assembled from ghost bottleneck blocks.

    Args:
        layers: number of blocks in each of the four stages (indexed 0..3).
        block: block class; must expose an ``expansion`` class attribute and
            accept ``(inplanes, planes, stride, downsample, norm_layer)``.
        norm_layer: callable ``channels -> nn.Module`` used for normalisation.

    Attributes:
        layers: ``nn.ModuleList`` holding one ``nn.Sequential`` per stage.
        channels: output channel count of each stage, in order.
    """

    def __init__(self, layers, block=GhostBottleneck, norm_layer=nn.BatchNorm2d):
        super().__init__()

        self.num_base_layers = len(layers)
        self.layers = nn.ModuleList()
        self.channels = []
        self.norm_layer = norm_layer
        self.inplanes = 64
        # (3, 550, 550) to (64, 138, 138) stage
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.make_layer(block, 64, layers[0])              # stage 2
        self.make_layer(block, 128, layers[1], stride=2)   # stage 3
        self.make_layer(block, 256, layers[2], stride=2)   # stage 4
        self.make_layer(block, 512, layers[3], stride=2)   # stage 5

    def make_layer(self, block, planes, blocks, stride=1):
        """Append one stage of ``blocks`` blocks to ``self.layers``.

        The first block carries ``stride`` and, when the shape changes, a 1x1
        conv + norm projection for the shortcut; the remaining blocks keep the
        shape. Updates ``self.inplanes`` and appends to ``self.channels``.
        """
        out_planes = planes * block.expansion

        downsample = None
        if stride != 1 or self.inplanes != out_planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, out_planes, kernel_size=1, stride=stride, bias=False),
                self.norm_layer(out_planes))

        layers = [block(self.inplanes, planes, stride, downsample, self.norm_layer)]

        self.inplanes = out_planes

        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, norm_layer=self.norm_layer))

        self.channels.append(out_planes)
        self.layers.append(nn.Sequential(*layers))

    def forward(self, x):
        """ Returns a tuple with the output of every stage, shallowest first. """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.maxpool(x)

        outs = []
        for layer in self.layers:
            x = layer(x)
            outs.append(x)

        return tuple(outs)

    def init_backbone(self, path):
        """ Initializes the backbone weights for training.

        Loads the state dict stored at ``path`` and renames keys of the form
        ``layer<N>...`` to ``layers.<N-1>...`` so they match this module's
        ``nn.ModuleList``. Keys that do not start with ``layer<digits>`` --
        including ones already in ``layers.<k>`` form -- are left untouched.
        Loading is non-strict, so missing or unexpected keys are ignored.
        """
        import re

        # Load onto the CPU so a checkpoint saved from a GPU can be read on a
        # CPU-only host; load_state_dict copies into the existing parameters.
        state_dict = torch.load(path, map_location='cpu')

        for key in list(state_dict):
            # Whole-number stage index, not just the single character after 'layer'.
            match = re.match(r'layer(\d+)(.*)', key, re.DOTALL)
            if match is None:
                continue
            idx = int(match.group(1))
            new_key = 'layers.' + str(idx - 1) + match.group(2)
            state_dict[new_key] = state_dict.pop(key)

        self.load_state_dict(state_dict, strict=False)

    def add_layer(self, conv_channels=1024, downsample=2, depth=1, block=GhostBottleneck):
        """ Add a downsample layer to the backbone as per what SSD does. """
        self.make_layer(block, conv_channels // block.expansion, blocks=depth, stride=downsample)



def construct_backbone(cfg_backbone = GhostResNet ):
    """Instantiate the backbone and make sure it has enough stages.

    The backbone is built with stage depths ``[3, 4, 23, 3]``. Extra stages
    are appended through ``add_layer()`` until the deepest selected stage
    index exists in ``backbone.layers``.
    """
    # resnet101 has 3, 4, 23, 3 blocks for each stage
    stage_depths = [3, 4, 23, 3]
    net = cfg_backbone(stage_depths)

    selected_layers = [1, 2, 3]
    deepest_selected = max(selected_layers)

    # Keep extending until index `deepest_selected` is valid.
    while len(net.layers) <= deepest_selected:
        net.add_layer()

    return net

from torchvision import models
from torchsummary import summary

# Print a layer-by-layer summary of the backbone on the available device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = construct_backbone().to(device)
summary(model, (3, 550, 550))
# NOTE(review): `input` shadows the Python builtin for the rest of the notebook.
input=torch.randn(1, 3, 550, 550)
# A second, freshly initialised backbone is built here (it is not `model`) and
# is never moved to `device`. `backbone` holds its tuple of per-stage outputs,
# not a module.
backbone=construct_backbone()(input)

# One entry per backbone stage; indices 0..3 are labelled C2..C5 below.
print('ghost-resnet output feature :', len(backbone))
print('C2 output shape : ', backbone[0].shape)
print('C3 output shape : ', backbone[1].shape)
print('C4 output shape : ', backbone[2].shape)
print('C5 output shape : ', backbone[3].shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 275, 275]           9,408
       BatchNorm2d-2         [-1, 64, 275, 275]             128
              ReLU-3         [-1, 64, 275, 275]               0
         MaxPool2d-4         [-1, 64, 138, 138]               0
            Conv2d-5         [-1, 32, 138, 138]           2,080
            Conv2d-6         [-1, 32, 138, 138]             320
       GhostModule-7         [-1, 64, 138, 138]               0
       BatchNorm2d-8         [-1, 64, 138, 138]             128
              ReLU-9         [-1, 64, 138, 138]               0
           Conv2d-10         [-1, 32, 138, 138]          18,464
           Conv2d-11         [-1, 32, 138, 138]             320
      GhostModule-12         [-1, 64, 138, 138]               0
      BatchNorm2d-13         [-1, 64, 138, 138]             128
             ReLU-14         [-1, 64, 1

In [7]:
class DFFN(nn.Module):
    """Feature-fusion neck: top-down pathway, bottom-up pathway, two extra levels.

    ``forward`` takes three backbone feature maps with 512 / 1024 / 2048
    channels and returns five 256-channel maps ``(n3, n4, n5, n6, n7)``.

    Each input is expected to have ``ceil(previous / 2)`` spatial size relative
    to the one before it (for example 69 -> 35 -> 18): the stride-2 convs on
    the bottom-up path produce exactly that size, and the results are added
    element-wise to the top-down maps.
    """

    def __init__(self):
        super().__init__()

        # 1x1 convolution for P5
        self.p5_conv = nn.Conv2d(2048, 256, kernel_size=1)

        # 1x1 convolutions for C4 and C3
        self.c4_conv = nn.Conv2d(1024, 256, kernel_size=1)
        self.c3_conv = nn.Conv2d(512, 256, kernel_size=1)

        # 3x3 convolutions for N3, N4, N5
        self.n3_conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.n4_conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.n5_conv = nn.Conv2d(256, 256, kernel_size=3, padding=1)

        # Downsampling layers
        self.m3_down = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        self.m4_down = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        self.n6_down = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)
        self.n7_down = nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1)

    def forward(self, c3, c4, c5):
        """Fuse ``c3`` (512 ch), ``c4`` (1024 ch), ``c5`` (2048 ch) into ``(n3, n4, n5, n6, n7)``."""
        # Calculate P5
        p5 = self.p5_conv(c5)

        # Calculate P4: upsample P5 to C4's resolution, whatever it is, instead
        # of a size that is only right for 550x550 inputs.
        p4_upsample = F.interpolate(p5, size=c4.shape[-2:], mode='bilinear', align_corners=False)
        p4_conv = self.c4_conv(c4)
        p4 = p4_upsample + p4_conv

        # Calculate P3: same idea, matching C3's resolution.
        p3_upsample = F.interpolate(p4, size=c3.shape[-2:], mode='bilinear', align_corners=False)
        p3_conv = self.c3_conv(c3)
        p3 = p3_upsample + p3_conv

        # Calculate N3
        n3 = self.n3_conv(p3)

        # Calculate M3 and N4
        m3 = self.m3_down(n3)
        n4 = self.n4_conv(p4 + m3)

        # Calculate M4 and N5
        m4 = self.m4_down(n4)
        n5 = self.n5_conv(p5 + m4)

        # Calculate N6 and N7
        n6 = self.n6_down(n5)
        n7 = self.n7_down(n6)

        return n3, n4, n5, n6, n7

# Feed the last three backbone stage outputs (indices 1..3) to the fusion neck;
# the stage at index 0 is not used.
c3 = backbone[1]
c4 = backbone[2]
c5 = backbone[3]

dffn = DFFN()
n3, n4, n5, n6, n7 = dffn(c3, c4, c5)
# Tuple of all five fused maps, highest resolution first; iterated by the
# prediction-head cell below.
dffn_outs = n3, n4, n5, n6, n7

print("N3 size:", n3.shape)
print("N4 size:", n4.shape)
print("N5 size:", n5.shape)
print("N6 size:", n6.shape)
print("N7 size:", n7.shape)


N3 size: torch.Size([1, 256, 69, 69])
N4 size: torch.Size([1, 256, 35, 35])
N5 size: torch.Size([1, 256, 18, 18])
N6 size: torch.Size([1, 256, 9, 9])
N7 size: torch.Size([1, 256, 5, 5])


In [8]:
# The prototype network runs on N3, the highest-resolution DFFN output
# (69x69 in the run above).
#
# Each entry is (out_channels, kernel_size, extra conv kwargs). Entry 3 is not a
# convolution: Protonet.forward negates its second field and uses the result (2)
# as the bilinear upsampling scale factor.

mask_proto_net = [(256, 3, {'padding': 1}), (256, 3, {'padding': 1}), (256, 3, {'padding': 1}),
                  (None, -2, {}), (256, 3, {'padding': 1}), (32, 1, {})]

class Protonet(nn.Module) :
    """Prototype-mask head: three convs, a bilinear upsample, then two more convs.

    Args:
        mask_proto_net: six-entry sequence of ``(out_channels, kernel_size,
            kwargs)`` tuples. Entries 0, 1, 2, 4 and 5 describe convolutions;
            entry 3 describes the upsample step, whose ``kernel_size`` field is
            the negated scale factor and whose ``kwargs`` are passed to
            ``F.interpolate``.

    The input to ``forward`` must have ``self.inplanes`` (256) channels.
    """

    def __init__(self, mask_proto_net) :
        super().__init__()

        self.inplanes=256
        self.mask_proto_net = mask_proto_net
        specs = mask_proto_net
        # Each conv consumes the previous conv's output width, so configs whose
        # intermediate layers are not 256 channels wide work too.
        self.conv1 = nn.Conv2d(self.inplanes, specs[0][0], kernel_size=specs[0][1], **specs[0][2])
        self.conv2 = nn.Conv2d(specs[0][0], specs[1][0], kernel_size=specs[1][1], **specs[1][2])
        self.conv3 = nn.Conv2d(specs[1][0], specs[2][0], kernel_size=specs[2][1], **specs[2][2])
        # Entry 3 is the upsample step; it leaves the channel count unchanged.
        self.conv4 = nn.Conv2d(specs[2][0], specs[4][0], kernel_size=specs[4][1], **specs[4][2])
        self.conv5 = nn.Conv2d(specs[4][0], specs[5][0], kernel_size=specs[5][1], **specs[5][2])
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return the prototype maps for ``x``; no activation follows the last conv."""
        out = self.conv1(x)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.relu(out)
        # The config stores the scale factor negated (e.g. -2 means upsample x2).
        out = F.interpolate(out, scale_factor = -self.mask_proto_net[3][1], mode='bilinear', align_corners=False, **self.mask_proto_net[3][2])
        out = self.relu(out)
        out = self.conv4(out)
        out = self.relu(out)
        out = self.conv5(out)

        return out


# Run a freshly initialised prototype head on N3. `proto_out` is read by the
# next cell to obtain the mask-coefficient dimension.
proto_out=Protonet(mask_proto_net)(n3)
# Dumps the tensor values themselves; the shape is printed separately below.
print(proto_out)
print('-'*50)
print('Proto net output shape : ', proto_out.shape)

tensor([[[[-3.6992e-02, -3.1762e-02, -3.1715e-02,  ..., -4.9166e-02,
           -4.6628e-02, -3.9687e-02],
          [-4.1543e-02, -3.9857e-02, -4.0299e-02,  ..., -4.9766e-02,
           -4.4441e-02, -3.5860e-02],
          [-4.0858e-02, -4.2961e-02, -4.3873e-02,  ..., -5.5714e-02,
           -5.0131e-02, -3.8075e-02],
          ...,
          [-4.6255e-02, -5.0444e-02, -5.0349e-02,  ..., -4.8980e-02,
           -5.2253e-02, -4.5944e-02],
          [-4.5388e-02, -4.6656e-02, -4.6662e-02,  ..., -4.6578e-02,
           -5.0253e-02, -4.6804e-02],
          [-3.6967e-02, -3.6889e-02, -3.5172e-02,  ..., -4.1175e-02,
           -4.3050e-02, -4.6559e-02]],

         [[-2.1900e-02, -2.2436e-02, -2.5400e-02,  ..., -2.9217e-02,
           -2.6009e-02, -2.5228e-02],
          [-2.1987e-02, -2.8460e-02, -3.4016e-02,  ..., -3.7938e-02,
           -3.3104e-02, -2.6514e-02],
          [-2.6655e-02, -3.1347e-02, -3.4465e-02,  ..., -3.9826e-02,
           -3.6048e-02, -2.6599e-02],
          ...,
     

In [9]:
# One mask coefficient per prototype channel produced by the prototype head.
coef_dim=proto_out.shape[1]
num_classes=81
# Must be an assignment: the `name: [...]` form is only a variable annotation
# and never binds `aspect_ratios`.
aspect_ratios = [1, 1 / 2, 2]
class PredictionModule(nn.Module):
    """Prediction head producing box, class and mask-coefficient outputs per prior.

    Args:
        in_channels: channels of the feature map passed to ``forward``.
        coef_dim: number of mask coefficients predicted per prior.

    ``forward`` returns a dict with keys ``'box'``, ``'class'`` and ``'coef'``,
    shaped ``[n, h*w*num_priors, 4]``, ``[n, h*w*num_priors, num_classes]`` and
    ``[n, h*w*num_priors, coef_dim]``. ``'coef'`` is passed through ``tanh``.
    """

    def __init__(self, in_channels, coef_dim):
        super().__init__()

        self.num_classes = 81
        self.coef_dim = coef_dim
        self.num_priors = 3            # num of anchor box for each pixel of feature map

        out_channels = 256
        # Honour the caller's in_channels instead of assuming a 256-channel input.
        self.upfeature = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU(inplace=True)

        self.bbox_layer = nn.Conv2d(out_channels, self.num_priors * 4, kernel_size=3, padding=1)
        self.conf_layer = nn.Conv2d(out_channels, self.num_priors * self.num_classes, kernel_size=3, padding=1)
        self.mask_layer = nn.Conv2d(out_channels, self.num_priors * self.coef_dim, kernel_size=3, padding=1)

    def forward(self, x):
        """Return ``{'box': ..., 'class': ..., 'coef': ...}`` for feature map ``x``.

        Also prints the raw and reshaped mask-coefficient shapes on every call.
        """
        x = self.upfeature(x)
        x = self.relu(x)
        batch = x.size(0)
        conf = self.conf_layer(x).permute(0, 2, 3, 1).contiguous().view(batch, -1, self.num_classes)
        bbox = self.bbox_layer(x).permute(0, 2, 3, 1).contiguous().view(batch, -1, 4)
        # Evaluate the mask branch once and reuse it for the debug print and the output.
        coef_raw = self.mask_layer(x)
        print('mask layer output shape : ', coef_raw.shape)
        # mask_layer output is [n, num_priors*coef_dim, h, w]; move channels last
        # and flatten so every prior gets its own row of coef_dim values.
        coef = coef_raw.permute(0, 2, 3, 1).contiguous().view(batch, -1, self.coef_dim)
        print('Changed shape : ', coef.shape)
        coef = torch.tanh(coef)

        return {'box': bbox, 'class': conf, 'coef': coef}
# Only one prediction head is created; it is applied to every DFFN level below.
prediction_layers = nn.ModuleList()
prediction_layers.append(PredictionModule(in_channels=256, coef_dim=coef_dim))
# Sanity check on the highest-resolution level (N3).
print(prediction_layers[0](dffn_outs[0]))

# Collect the per-level outputs: each key maps to a list with one tensor per
# DFFN level, in the same order as dffn_outs.
predictions = {'box': [], 'class': [], 'coef': []}
for i in range(len(dffn_outs)) :
    # The head at index 0 is reused for every level.
    p=prediction_layers[0](dffn_outs[i])
    for key, value in p.items() :
        predictions[key].append(value)
print(predictions.keys())

mask layer output shape :  torch.Size([1, 96, 69, 69])
Changed shape :  torch.Size([1, 14283, 32])
{'box': tensor([[[ 0.0699, -0.1327,  0.0862, -0.0083],
         [ 0.1819,  0.2032,  0.1766, -0.0538],
         [ 0.1502, -0.2661, -0.3464,  0.3481],
         ...,
         [ 0.0700,  0.1908,  0.3150, -0.0221],
         [-0.0756, -0.1402,  0.3611, -0.1429],
         [ 0.2203, -0.0656,  0.0964,  0.2485]]], grad_fn=<ViewBackward0>), 'class': tensor([[[ 0.3049, -0.0413,  0.1596,  ...,  0.2430, -0.0452,  0.0548],
         [ 0.1053,  0.0050,  0.1711,  ..., -0.2070, -0.1530, -0.0506],
         [ 0.1998,  0.0952,  0.1474,  ...,  0.3210, -0.1838, -0.3433],
         ...,
         [-0.1159, -0.1293, -0.1754,  ...,  0.0729,  0.1895, -0.2376],
         [-0.1461,  0.0587, -0.0773,  ..., -0.0358,  0.0603,  0.2492],
         [-0.0545, -0.1177, -0.0736,  ...,  0.0749,  0.1883,  0.0668]]],
       grad_fn=<ViewBackward0>), 'coef': tensor([[[-0.0513, -0.1265,  0.0539,  ...,  0.3950, -0.0687, -0.0295],
      