# Notebook to build the PP-PicoDet model from basic PyTorch building blocks
- https://arxiv.org/abs/2111.00902

## Note: new version (PicoDetV2)
- https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/architectures/picodet.py
- LCNET backbone (https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/backbones/lcnet.py)
- LCPAN (https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/necks/lc_pan.py)
- picoheadv2 (https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/heads/pico_head.py)

In [3]:
import torch
import torch.nn as nn

# Backbone PPLCNET (https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/backbones/lcnet.py)

In [4]:
# utils 
# Layer specifications for each backbone stage.
# Every row reads: [kernel_size, in_channels, out_channels, stride, use_se].
NET_CONFIG = {
    "blocks2": [
        [3, 16, 32, 1, False],
    ],
    "blocks3": [
        [3, 32, 64, 2, False],
        [3, 64, 64, 1, False],
    ],
    "blocks4": [
        [3, 64, 128, 2, False],
        [3, 128, 128, 1, False],
    ],
    # One strided 3x3 layer followed by five identical 5x5 layers.
    "blocks5": (
        [[3, 128, 256, 2, False]]
        + [[5, 256, 256, 1, False] for _ in range(5)]
    ),
    "blocks6": [
        [5, 256, 512, 2, True],
        [5, 512, 512, 1, True],
    ],
}



def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a nearby multiple of ``divisor``.

    Args:
        v: Value to round (typically a scaled channel count).
        divisor: The result is a multiple of this number.
        min_value: Lower bound for the result; falls back to ``divisor``
            when ``None``.

    Returns:
        The rounded value, never below the lower bound and bumped up by one
        ``divisor`` step if rounding would fall under 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    nearest_multiple = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, nearest_multiple)
    # Guard against rounding down by more than 10%.
    return result + divisor if result < 0.9 * v else result

class ConvBNLayer(nn.Module):
    """Conv2d -> BatchNorm2d -> Hardswish block used by the backbone.

    Args:
        num_channels (int): Input channel count.
        filter_size (int): Square kernel size; padding is ``(filter_size - 1) // 2``.
        num_filters (int): Output channel count.
        stride (int): Convolution stride.
        num_groups (int): Convolution groups. Default: 1
    """

    def __init__(self, num_channels, filter_size, num_filters, stride, num_groups=1):
        super().__init__()

        pad = (filter_size - 1) // 2
        # No conv bias; a BatchNorm layer follows directly.
        self.conv = nn.Conv2d(
            num_channels,
            num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=pad,
            groups=num_groups,
            bias=False,
        )
        # Can be fused into the conv for inference.
        self.bn = nn.BatchNorm2d(num_filters)
        self.hardswish = nn.Hardswish()

    def forward(self, x):
        """Apply convolution, batch norm and hardswish in that order."""
        return self.hardswish(self.bn(self.conv(x)))

class SEModule(nn.Module):
    """Squeeze-and-excitation gate built from two 1x1 convolutions.

    Args:
        channel (int): Number of input (and output) channels.
        reduction (int): The hidden width is ``channel // reduction``. Default: 4
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        hidden = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(channel, hidden, kernel_size=1, stride=1, padding=0)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(hidden, channel, kernel_size=1, stride=1, padding=0)
        self.hardsigmoid = nn.Hardsigmoid()

    def forward(self, x):
        """Scale ``x`` channel-wise by a gate computed from its global average."""
        pooled = self.avg_pool(x)
        hidden = self.relu(self.conv1(pooled))
        gate = self.hardsigmoid(self.conv2(hidden))
        return gate * x
        
        
class DepthWiseSeparable(nn.Module):
    """Depthwise conv, optional SE gate, then a 1x1 pointwise conv.

    Args:
        num_channels (int): Input channel count.
        num_filters (int): Output channel count of the pointwise conv.
        stride (int): Stride of the depthwise conv.
        dw_size (int): Kernel size of the depthwise conv. Default: 3
        use_se (bool): Insert an ``SEModule`` between the two convs. Default: False
    """

    def __init__(self, num_channels, num_filters, stride, dw_size=3, use_se=False):
        super().__init__()
        self.use_se = use_se

        # One group per channel makes this a depthwise convolution.
        self.dw_conv = ConvBNLayer(
            num_channels=num_channels,
            filter_size=dw_size,
            num_filters=num_channels,
            stride=stride,
            num_groups=num_channels,
        )
        # The attribute only exists when the gate is requested.
        if use_se:
            self.se = SEModule(num_channels)
        self.pw_conv = ConvBNLayer(
            num_channels=num_channels,
            filter_size=1,
            num_filters=num_filters,
            stride=1,
        )

    def forward(self, x):
        """Run the depthwise stage, the optional gate and the pointwise stage."""
        features = self.dw_conv(x)
        if self.use_se:
            features = self.se(features)
        return self.pw_conv(features)
            
        
        

# Smoke test: push a random image through the stem conv and the first stage.
test_scale = 1
# Stem: 3x3, stride-2 convolution on a 3-channel input.
conv_bn = ConvBNLayer(num_channels=3,
                      filter_size=3,
                      num_filters=make_divisible(16*test_scale),
                      stride=2)
# Only constructed here; it is not applied to any tensor below.
se_module = SEModule(make_divisible(16*test_scale))
# One DepthWiseSeparable layer per row of NET_CONFIG["blocks2"].
blocks2 = nn.Sequential(*[DepthWiseSeparable(
                            num_channels=make_divisible(in_c*test_scale),
                            num_filters=make_divisible(out_c*test_scale),
                            dw_size=k,
                            stride=s,
                            use_se=se) for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"])])

test_input = torch.randn(1, 3, 320, 320)
test_out = conv_bn(test_input)
test_out = blocks2(test_out)
# The printed shape is shown in the cell output below.
print(test_out.size())

torch.Size([1, 32, 160, 160])


In [5]:
class LCNet(nn.Module):
    """LCNet backbone that returns a selection of intermediate feature maps.

    The network is a stem convolution followed by five stages
    (``blocks2`` ... ``blocks6``) described by ``NET_CONFIG``.

    Args:
        scale (float): Channel width multiplier; every channel count from
            ``NET_CONFIG`` is multiplied by it and passed through
            ``make_divisible``. Default: 1.0
        feature_maps (list[int]): Ids of the stages whose outputs are
            returned. Id ``2`` is the output of ``blocks3``, ``3`` of
            ``blocks4``, ``4`` of ``blocks5`` and ``5`` of ``blocks6``.
            Default: [3, 4, 5]
    """

    # Stages whose outputs can be returned, in forward order.
    _OUTPUT_STAGES = ("blocks3", "blocks4", "blocks5", "blocks6")

    def __init__(self, scale=1.0, feature_maps=[3, 4, 5]):
        super().__init__()
        self.scale = scale
        # Keep a private copy so the instance never aliases the shared
        # default list (or a list the caller goes on to modify).
        self.feature_maps = list(feature_maps)

        self.conv1 = ConvBNLayer(num_channels=3,
                                 filter_size=3,
                                 num_filters=make_divisible(16 * scale),
                                 stride=2)
        # Stages are created in forward order so parameter initialisation
        # order (and therefore seeded results) stays the same as before.
        self.blocks2 = self._make_stage("blocks2", scale)
        self.blocks3 = self._make_stage("blocks3", scale)
        self.blocks4 = self._make_stage("blocks4", scale)
        self.blocks5 = self._make_stage("blocks5", scale)
        self.blocks6 = self._make_stage("blocks6", scale)

        # Output width of a stage is the out_c column of its last row.
        stage_channels = [
            make_divisible(NET_CONFIG[name][-1][2] * scale)
            for name in self._OUTPUT_STAGES
        ]
        self._out_channels = [
            ch for idx, ch in enumerate(stage_channels)
            if idx + 2 in self.feature_maps
        ]

    @staticmethod
    def _make_stage(name, scale):
        """Build the ``nn.Sequential`` for one ``NET_CONFIG`` stage."""
        return nn.Sequential(*[
            DepthWiseSeparable(num_channels=make_divisible(in_c * scale),
                               num_filters=make_divisible(out_c * scale),
                               dw_size=k,
                               stride=s,
                               use_se=se)
            for k, in_c, out_c, s, se in NET_CONFIG[name]
        ])

    def forward(self, inputs):
        """Run the backbone.

        Args:
            inputs (dict): Must hold the input image tensor under ``'image'``.

        Returns:
            list[Tensor]: Outputs of the stages selected by ``feature_maps``,
            in forward order.
        """
        x = self.blocks2(self.conv1(inputs['image']))

        outs = []
        for stage in (self.blocks3, self.blocks4, self.blocks5, self.blocks6):
            x = stage(x)
            outs.append(x)
        return [feat for idx, feat in enumerate(outs)
                if idx + 2 in self.feature_maps]

# Smoke test: build the backbone with its defaults and print the shape of
# every feature map it returns.
backbone = LCNet()
inp_t = torch.randn(1, 3, 320, 320)
# LCNet.forward reads the tensor from the 'image' key of its input dict.
output = backbone({'image': inp_t})
for t in output:
    print(t.size())

torch.Size([1, 128, 40, 40])
torch.Size([1, 256, 20, 20])
torch.Size([1, 512, 10, 10])


# Detector Neck LCPAN: (https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/necks/lc_pan.py)

In [6]:
class ConvBNLayerPAN(nn.Module):
    """Conv2d -> BatchNorm2d -> optional activation, used by the PAN neck.

    PaddleDetection has two modules named ``ConvBNLayer``; this one is named
    ``ConvBNLayerPAN`` to keep the two apart.

    Args:
        in_channel (int): Input channel count. Default: 96
        out_channel (int): Output channel count. Default: 96
        kernel_size (int): Square kernel size; padding is
            ``(kernel_size - 1) // 2``. Default: 3
        stride (int): Convolution stride. Default: 1
        groups (int): Convolution groups. Default: 1
        act (str | None): ``'hard_swish'`` selects Hardswish, any other
            non-empty value selects LeakyReLU, and ``None`` / ``''`` disables
            the activation. Default: 'leaky_relu'
    """

    def __init__(self,
                 in_channel=96,
                 out_channel=96,
                 kernel_size=3,
                 stride=1,
                 groups=1,
                 act='leaky_relu'):
        super(ConvBNLayerPAN, self).__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channel,
            out_channels=out_channel,
            kernel_size=kernel_size,
            groups=groups,
            padding=(kernel_size - 1) // 2,
            stride=stride,
            bias=False)
        self.bn = nn.BatchNorm2d(out_channel)

        # Whether forward() applies the activation at all.
        self.has_act = bool(act)
        self.act = nn.Hardswish() if act == "hard_swish" else nn.LeakyReLU()

    def forward(self, x):
        """Apply conv and batch norm, then the activation if one was requested."""
        x = self.bn(self.conv(x))
        # Test the flag, not self.act: a module instance is always truthy,
        # so checking it would apply the activation even for act=None.
        if self.has_act:
            x = self.act(x)
        return x
    
class Channel_T(nn.Module):
    """Project every input feature map to a common channel count.

    Args:
        in_channels (list[int]): Channel count of each input level.
            Default: [116, 232, 464]
        out_channels (int): Channel count produced for every level. Default: 96
        act (str): Activation name passed to each ``ConvBNLayerPAN``.
            Default: 'leaky_relu'
    """

    def __init__(self,
                 in_channels=[116, 232, 464],
                 out_channels=96,
                 act="leaky_relu"):
        super(Channel_T, self).__init__()
        # One 1x1 projection per input level.
        projections = [
            ConvBNLayerPAN(width, out_channels, 1, act=act)
            for width in in_channels
        ]
        self.convs = nn.ModuleList(projections)

    def forward(self, x):
        """Apply the i-th projection to the i-th input and return the list."""
        return [self.convs[level](feat) for level, feat in enumerate(x)]
            
class DPModule(nn.Module):
    """
    Depth-wise and point-wise module.
     Args:
        in_channel (int): The input channels of this Module. The first
                   convolution uses ``groups=out_channel``, so this must be
                   divisible by ``out_channel``.
        out_channel (int): The output channels of this Module.
        kernel_size (int): The conv2d kernel size of this Module.
        stride (int): The conv2d's stride of this Module.
        act (str): The activation function of this Module,
                   Now support `leaky_relu` and `hard_swish`. ``None`` or
                   ``''`` disables every activation; any other value falls
                   back to LeakyReLU.
        use_act_in_out (bool): Whether the activation is also applied after
                   the point-wise convolution. Default: True
    """
    def __init__(self,
                 in_channel=96,
                 out_channel=96,
                 kernel_size=3,
                 stride=1,
                 act='leaky_relu',
                 use_act_in_out=True):
        super(DPModule, self).__init__()
        self.use_act = bool(act)
        self.use_act_in_out = use_act_in_out
        self.dwconv = nn.Conv2d(
            in_channels=in_channel,
            out_channels=out_channel,
            kernel_size=kernel_size,
            groups=out_channel,
            padding=(kernel_size - 1) // 2,
            stride=stride,
            bias=False)
        self.bn1 = nn.BatchNorm2d(out_channel)
        self.pwconv = nn.Conv2d(
            in_channels=out_channel,
            out_channels=out_channel,
            kernel_size=1,
            groups=1,
            padding=0,
            bias=False)
        self.bn2 = nn.BatchNorm2d(out_channel)
        self.act_func = nn.Hardswish() if act == "hard_swish" else nn.LeakyReLU()

    def forward(self, x):
        """Depth-wise conv + BN (+ act), then point-wise conv + BN (+ act)."""
        x = self.bn1(self.dwconv(x))
        if self.use_act:
            x = self.act_func(x)
        x = self.bn2(self.pwconv(x))
        # The output activation needs both switches: with act=None there is
        # no activation to apply, whatever use_act_in_out says.
        if self.use_act and self.use_act_in_out:
            x = self.act_func(x)
        return x
            
# test Channel_T: the three inputs use the default in_channels (116/232/464);
# every output is expected to have the default 96 channels.
channel_t = Channel_T(act='hard_swish')
inp = [torch.randn(1, c, 255, 255) for c in [116, 232, 464]]
out = channel_t(inp)
for t in out:
    print(t.size())

# test DPModule: default arguments (96 -> 96 channels, 3x3 kernel, stride 1),
# so the printed shape should equal the input shape.
dp_module = DPModule()
inp = torch.randn(1, 96, 25, 25)
out = dp_module(inp)
print(out.size())

torch.Size([1, 96, 255, 255])
torch.Size([1, 96, 255, 255])
torch.Size([1, 96, 255, 255])
torch.Size([1, 96, 25, 25])


In [7]:
class LCPAN(nn.Module):
    """Path Aggregation Network built from LCNet depthwise-separable blocks.

    Args:
        in_channels (List[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale)
        kernel_size (int): Kernel size of the depthwise convolutions.
            Default: 5
        num_features (int): Number of feature maps returned. ``4`` appends
            one extra, coarser level to the regular outputs. Default: 3
        use_depthwise (bool): Use ``DPModule`` for the stride-2 convolutions
            when True, ``ConvBNLayerPAN`` otherwise. Default: True
        act (str): Activation name passed to the channel-reduction and
            stride-2 convolutions. Default: 'hard_swish'
        spatial_scales (List[float]): One entry per input level; its length
            fixes the number of levels. The argument is copied, never
            modified.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=5,
                 num_features=3,
                 use_depthwise=True,
                 act='hard_swish',
                 spatial_scales=[0.125, 0.0625, 0.03125]):
        super(LCPAN, self).__init__()
        self.conv_t = Channel_T(in_channels, out_channels, act=act)
        # Work on a copy: the num_features == 4 branch appends to this list,
        # and appending to the argument would grow the shared default list
        # (and with it the level count of every later instance).
        spatial_scales = list(spatial_scales)
        in_channels = [out_channels] * len(spatial_scales)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.spatial_scales = spatial_scales
        self.num_features = num_features
        # ConvBNLayerPAN accepts the same arguments as DPModule; the backbone
        # ConvBNLayer has a different signature and cannot be used here.
        conv_func = DPModule if use_depthwise else ConvBNLayerPAN

        # Fusion block applied to two concatenated levels:
        # [k, in_c, out_c, stride, use_se] per layer.
        fusion_cfg = [
            [kernel_size, out_channels * 2, out_channels * 2, 1, False],
            [kernel_size, out_channels * 2, out_channels, 1, False],
        ]

        def make_fusion_block():
            return nn.Sequential(*[
                DepthWiseSeparable(num_channels=in_c,
                                   num_filters=out_c,
                                   dw_size=k,
                                   stride=s,
                                   use_se=se)
                for k, in_c, out_c, s, se in fusion_cfg
            ])

        if self.num_features == 4:
            self.first_top_conv = conv_func(
                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
            self.second_top_conv = conv_func(
                in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
            self.spatial_scales.append(self.spatial_scales[-1] / 2)

        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.top_down_blocks = nn.ModuleList()
        for _ in range(len(in_channels) - 1, 0, -1):
            self.top_down_blocks.append(make_fusion_block())

        self.downsamples = nn.ModuleList()
        self.bottom_up_blocks = nn.ModuleList()

        for idx in range(len(in_channels) - 1):
            self.downsamples.append(conv_func(in_channels[idx],
                                              in_channels[idx],
                                              kernel_size=kernel_size,
                                              stride=2,
                                              act=act))
            self.bottom_up_blocks.append(make_fusion_block())

    def forward(self, inputs):
        """
        Args:
            inputs (tuple[Tensor]): input features, one per level, ordered
                from the highest resolution to the lowest.
        Returns:
            tuple[Tensor]: fused features in the same order, plus one extra
                level when ``num_features == 4``.
        """

        assert len(inputs) == len(self.in_channels)
        inputs = self.conv_t(inputs)
        num_levels = len(self.in_channels)

        # top-down path: upsample the coarser map and fuse it with the next
        # finer input; results are collected finest-first.
        inner_outs = [inputs[-1]]
        for idx in range(num_levels - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]

            upsample_feat = self.upsample(feat_high)

            inner_out = self.top_down_blocks[num_levels - 1 - idx](
                torch.cat((upsample_feat, feat_low), dim=1))
            inner_outs.insert(0, inner_out)

        # bottom-up path: downsample the finer output and fuse it with the
        # next coarser top-down result.
        outs = [inner_outs[0]]
        for idx in range(num_levels - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](
                torch.cat((downsample_feat, feat_high), dim=1))

            outs.append(out)

        if self.num_features == 4:
            # Extra level: sum of the strided coarsest input and the strided
            # coarsest output.
            top_features = self.first_top_conv(inputs[-1])
            top_features = top_features + self.second_top_conv(outs[-1])
            outs.append(top_features)
        return tuple(outs)
# Constructor check only; the channel counts match the LCNet() feature map
# shapes printed by the backbone test above.
lcpan = LCPAN(in_channels=[128, 256, 512], out_channels=96)

# Test backbone plus LCPAN

In [8]:
# model config
scale = 0.75
feature_maps = [3, 4, 5]
out_channels = 96

backbone = LCNet(scale, feature_maps)
# get backbone output shape
# (read from a dummy forward pass, so the neck always matches the backbone)
outputs = backbone({'image': torch.randn(1, 3, 320, 320)})
in_channels = [c.size()[1] for c in outputs]

neck = LCPAN(in_channels, out_channels)
outputs = neck(outputs)
# Print the shape of every neck output level.
for out in outputs:
    print(out.size())

torch.Size([1, 96, 40, 40])
torch.Size([1, 96, 20, 20])
torch.Size([1, 96, 10, 10])


# PicoHeadV2: https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/heads/pico_head.py

In [39]:
import torch.nn.functional as F

class ConvNormLayer(nn.Module):
    """Conv2d followed by an optional normalisation layer (no activation).

    Args:
        ch_in (int): Input channel count.
        ch_out (int): Output channel count.
        filter_size (int): Square kernel size; padding is
            ``(filter_size - 1) // 2``.
        stride (int): Convolution stride.
        groups (int): Convolution groups. Default: 1
        norm_type (str | None): ``'bn'`` / ``'sync_bn'`` use ``BatchNorm2d``,
            ``'gn'`` uses ``GroupNorm`` with ``norm_groups`` groups, ``None``
            disables normalisation. Default: 'bn'
        norm_groups (int): Group count for ``'gn'``; ``ch_out`` must be
            divisible by it. Default: 32
        use_dcn (bool): Deformable convolution is not implemented here;
            ``True`` raises ``NotImplementedError``. Default: False
        bias_on (bool): Whether the convolution has a bias. Default: False
        norm_decay, lr_scale, freeze_norm, skip_quant, dcn_lr_scale:
            Accepted so existing call sites keep working; this
            implementation does not use them.

    Raises:
        NotImplementedError: If ``use_dcn`` is True.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size,
                 stride,
                 groups=1,
                 norm_type='bn',
                 norm_decay=0.,
                 norm_groups=32,
                 use_dcn=False,
                 bias_on=False,
                 lr_scale=1.,
                 freeze_norm=False,
                 skip_quant=False,
                 dcn_lr_scale=2.):
        super(ConvNormLayer, self).__init__()
        assert norm_type in ['bn', 'sync_bn', 'gn', None]

        if use_dcn:
            raise NotImplementedError(
                "ConvNormLayer does not support use_dcn=True")
        self.conv = nn.Conv2d(in_channels=ch_in,
                              out_channels=ch_out,
                              kernel_size=filter_size,
                              stride=stride,
                              padding=(filter_size - 1) // 2,
                              groups=groups,
                              bias=bias_on)

        if norm_type in ['bn', 'sync_bn']:
            self.norm = nn.BatchNorm2d(ch_out)
        elif norm_type == 'gn':
            # 'gn' passes the assert above, so it must not silently drop
            # the normalisation layer.
            self.norm = nn.GroupNorm(num_groups=norm_groups,
                                     num_channels=ch_out)
        else:
            self.norm = None

    def forward(self, inputs):
        """Apply the convolution, then the normalisation layer if present."""
        out = self.conv(inputs)
        if self.norm is not None:
            out = self.norm(out)
        return out
    
    
class PicoSE(nn.Module):
    """Gate a feature map with a sigmoid weight, then apply a 1x1 conv + norm.

    Args:
        feat_channels (int): Channel count of both inputs and of the output.
    """

    def __init__(self, feat_channels):
        super(PicoSE, self).__init__()
        self.fc = nn.Conv2d(in_channels=feat_channels,
                            out_channels=feat_channels,
                            kernel_size=1)
        self.conv = ConvNormLayer(ch_in=feat_channels,
                                  ch_out=feat_channels,
                                  filter_size=1,
                                  stride=1)

    def forward(self, feat, avg_feat):
        """Weight ``feat`` by ``sigmoid(fc(avg_feat))`` and project the result."""
        gate = F.sigmoid(self.fc(avg_feat))
        reweighted = feat * gate
        return self.conv(reweighted)
        
    
class PicoFeat(nn.Module):
    """
    PicoFeat of PicoDet
    Args:
        feat_in (int): The channel number of input Tensor. The depthwise
            convolutions use ``groups=feat_out``, so it must be divisible
            by ``feat_out``.
        feat_out (int): The channel number of output Tensor.
        num_fpn_stride (int): Number of FPN levels; one conv tower (and one
            SE module) is built per level.
        num_convs (int): Number of depthwise + pointwise pairs per tower.
        norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'.
        share_cls_reg (bool): Whether to share the cls and reg output.
        act (str): The act of per layers: 'hard_swish', 'leaky_relu', or
            ``None`` for no activation.
        use_se (bool): Whether to use se module. Requires
            ``share_cls_reg=True``.
    Raises:
        ValueError: If ``act`` is a non-empty value other than
            'hard_swish' / 'leaky_relu'.
    """

    def __init__(self,
                 feat_in=256,
                 feat_out=96,
                 num_fpn_stride=3,
                 num_convs=2,
                 norm_type='bn',
                 share_cls_reg=False,
                 act='hard_swish',
                 use_se=False):
        super(PicoFeat, self).__init__()
        self.num_convs = num_convs
        self.norm_type = norm_type
        self.share_cls_reg = share_cls_reg
        self.act = act
        self.use_se = use_se
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        if use_se:
            assert share_cls_reg == True, \
                'In the case of using se, share_cls_reg must be set to True'
            self.se = nn.ModuleList()
        for stage_idx in range(num_fpn_stride):
            cls_subnet_convs = nn.ModuleList()
            reg_subnet_convs = nn.ModuleList()
            for i in range(self.num_convs):
                in_c = feat_in if i == 0 else feat_out
                cls_subnet_convs.extend(
                    self._dw_pw_pair(in_c, feat_out, norm_type))
                if not self.share_cls_reg:
                    reg_subnet_convs.extend(
                        self._dw_pw_pair(in_c, feat_out, norm_type))
            # Register each tower exactly once per FPN level, so that
            # cls_convs[stage_idx] / se[stage_idx] really address that
            # level's own modules.
            self.cls_convs.append(cls_subnet_convs)
            self.reg_convs.append(reg_subnet_convs)
            if use_se:
                self.se.append(PicoSE(feat_out))

        if act == 'hard_swish':
            self.act_func = nn.Hardswish()
        elif act == 'leaky_relu':
            self.act_func = nn.LeakyReLU()
        elif not act:
            self.act_func = nn.Identity()
        else:
            # Fail at construction instead of with an AttributeError on the
            # first forward pass.
            raise ValueError(
                "act must be 'hard_swish', 'leaky_relu' or None, got %r" % (act,))

    @staticmethod
    def _dw_pw_pair(in_c, feat_out, norm_type):
        """Return one 5x5 depthwise and one 1x1 pointwise ``ConvNormLayer``."""
        depthwise = ConvNormLayer(ch_in=in_c,
                                  ch_out=feat_out,
                                  filter_size=5,
                                  stride=1,
                                  groups=feat_out,
                                  norm_type=norm_type,
                                  bias_on=False,
                                  lr_scale=2.)
        # The pointwise conv consumes the depthwise output, which always has
        # feat_out channels (identical to in_c whenever feat_in == feat_out).
        pointwise = ConvNormLayer(ch_in=feat_out,
                                  ch_out=feat_out,
                                  filter_size=1,
                                  stride=1,
                                  norm_type=norm_type,
                                  bias_on=False,
                                  lr_scale=2.)
        return [depthwise, pointwise]

    def forward(self, fpn_feat, stage_idx):
        """Run the conv tower of FPN level ``stage_idx`` on ``fpn_feat``.

        Returns:
            ``(cls_feat, se_feat)`` when ``use_se`` is set, otherwise
            ``(cls_feat, reg_feat)``.
        """
        assert stage_idx < len(self.cls_convs)
        cls_feat = fpn_feat
        reg_feat = fpn_feat
        for i, cls_conv in enumerate(self.cls_convs[stage_idx]):
            cls_feat = self.act_func(cls_conv(cls_feat))
            # NOTE(review): the reg branch is re-seeded from cls_feat on
            # every iteration, so reg_convs[i] sees the cls activations
            # rather than the previous reg output. This looks like it
            # follows the referenced PaddleDetection code - confirm before
            # changing it.
            reg_feat = cls_feat
            if not self.share_cls_reg:
                reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat))
        if self.use_se:
            avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1))
            se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat))
            return cls_feat, se_feat
        return cls_feat, reg_feat
    
class Integral(nn.Module):
    """A fixed layer for calculating integral result from distribution.
    This layer calculates the target location by :math: `sum{P(y_i) * y_i}`,
    P(y_i) denotes the softmax vector that represents the discrete distribution
    y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max}
    Args:
        reg_max (int): The maximal value of the discrete set. Default: 16. You
            may want to reset it according to your new dataset or related
            settings.
    """

    def __init__(self, reg_max=16):
        super(Integral, self).__init__()
        self.reg_max = reg_max
        # The discrete set {0, 1, ..., reg_max}; a buffer so it follows the
        # module across devices without being a trainable parameter.
        self.register_buffer(
            'project', torch.linspace(0, self.reg_max, self.reg_max + 1))

    def forward(self, x):
        """Turn distribution logits into expected values.

        Args:
            x (Tensor): Logits whose element count is a multiple of
                ``reg_max + 1``; every consecutive group of ``reg_max + 1``
                values along the last axis is one distribution.

        Returns:
            Tensor: One expected value per distribution. The result is flat
            in eval mode and reshaped to ``(-1, 4)`` in training mode.
        """
        probs = torch.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
        expected = probs @ self.project.to(probs.dtype)
        if self.training:
            expected = expected.reshape(-1, 4)
        return expected
    
# Constructor check: PicoFeat with the cls/reg branch shared and the SE
# modules enabled (use_se requires share_cls_reg=True).
feat = PicoFeat(feat_in=96,
                feat_out=96,
                num_convs=2,
                num_fpn_stride=4,
                norm_type='bn',
                share_cls_reg=True,
                use_se=True)


In [95]:
import math

# Small constant added before the square root of the aligned class score in
# PicoHeadV2.forward_eval.
eps = 1e-9

class PicoHeadV2(nn.Module):
    """
    PicoHeadV2 (evaluation path only).
    Args:
        conv_feat (object): Instance of 'PicoFeat'; called as
            ``conv_feat(fpn_feat, level_index)`` and expected to return two
            feature maps. The string default is a placeholder and is not
            callable.
        dgqp_module: Accepted but not used by this implementation.
        num_classes (int): Number of classes
        fpn_stride (list): The stride of each FPN Layer
        prior_prob (float): Used to set the bias init for the class prediction layer
        use_align_head (bool): Multiply the class probability by an
            alignment probability predicted from the first ``conv_feat``
            output.
        loss_class (object): Instance of VariFocalLoss.
        loss_dfl (object): Instance of DistributionFocalLoss.
        loss_bbox (object): Instance of bbox loss.
        static_assigner_epoch, static_assigner, assigner: Stored only; the
            training path is not implemented here.
        reg_max: Max value of integral set :math: `{0, ..., reg_max}`
                in QFL setting. Default: 16.
        feat_in_chan (int): Channel count of the maps returned by
            ``conv_feat``.
        nms, nms_pre: Stored only.
        cell_offset (float): Offset added to every anchor grid coordinate.
        act (str): Activation name for the alignment ``DPModule``.
        grid_cell_scale (float): Stored only.
        eval_size (list | None): ``[height, width]`` of the evaluation input.
            When set, anchors are generated once in the constructor.
    """
    __inject__ = [
        'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox',
        'static_assigner', 'assigner', 'nms'
    ]
    __shared__ = ['num_classes', 'eval_size']

    def __init__(self,
                 conv_feat='PicoFeatV2',
                 dgqp_module=None,
                 num_classes=80,
                 fpn_stride=[8, 16, 32],
                 prior_prob=0.01,
                 use_align_head=True,
                 loss_class='VariFocalLoss',
                 loss_dfl='DistributionFocalLoss',
                 loss_bbox='GIoULoss',
                 static_assigner_epoch=60,
                 static_assigner='ATSSAssigner',
                 assigner='TaskAlignedAssigner',
                 reg_max=16,
                 feat_in_chan=96,
                 nms=None,
                 nms_pre=1000,
                 cell_offset=0,
                 act='hard_swish',
                 grid_cell_scale=5.0,
                 eval_size=None):
        super(PicoHeadV2, self).__init__()

        self.conv_feat = conv_feat
        self.num_classes = num_classes
        self.fpn_stride = fpn_stride
        self.prior_prob = prior_prob
        self.loss_vfl = loss_class
        self.loss_dfl = loss_dfl
        self.loss_bbox = loss_bbox

        self.static_assigner_epoch = static_assigner_epoch
        self.static_assigner = static_assigner
        self.assigner = assigner

        self.reg_max = reg_max
        self.feat_in_chan = feat_in_chan
        self.nms = nms
        self.nms_pre = nms_pre
        self.cell_offset = cell_offset
        self.act = act
        self.grid_cell_scale = grid_cell_scale
        self.use_align_head = use_align_head
        self.cls_out_channels = self.num_classes
        self.eval_size = eval_size

        # Bias b with sigmoid(b) == prior_prob.
        bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob)
        # Clear the super class initialization
        self.gfl_head_cls = None
        self.gfl_head_reg = None
        self.scales_regs = None

        self.head_cls_list = nn.ModuleList()
        self.head_reg_list = nn.ModuleList()
        self.cls_align = nn.ModuleList()

        for _ in range(len(fpn_stride)):
            head_cls = nn.Conv2d(
                in_channels=self.feat_in_chan,
                out_channels=self.cls_out_channels,
                kernel_size=1,
                stride=1,
                padding=0)
            # Apply the documented prior: untrained class scores start
            # at prior_prob.
            nn.init.constant_(head_cls.bias, bias_init_value)
            self.head_cls_list.append(head_cls)

            head_reg = nn.Conv2d(
                in_channels=self.feat_in_chan,
                out_channels=4 * (self.reg_max + 1),
                kernel_size=1,
                stride=1,
                padding=0)
            self.head_reg_list.append(head_reg)

            if self.use_align_head:
                self.cls_align.append(DPModule(self.feat_in_chan,
                                               1,
                                               5,
                                               act=self.act,
                                               use_act_in_out=False))

        # Anchors depend only on eval_size and the strides, so build them
        # once, after the per-level loop.
        if self.eval_size:
            self.anchor_points, self.stride_tensor = self._generate_anchors()

    def distribution_project(self, x):
        """Return the expected value of each ``reg_max + 1``-bin distribution.

        Every consecutive group of ``reg_max + 1`` logits along the last axis
        is soft-maxed and reduced to ``sum(p_i * i)``; the result is flat.
        """
        bins = torch.arange(self.reg_max + 1, dtype=x.dtype, device=x.device)
        probs = torch.softmax(x.reshape(-1, self.reg_max + 1), dim=1)
        return probs @ bins

    @staticmethod
    def _batch_distance2bbox(points, distance):
        """Decode (left, top, right, bottom) distances into x1, y1, x2, y2."""
        lt, rb = torch.split(distance, 2, dim=-1)
        return torch.cat([points - lt, points + rb], dim=-1)

    def forward_eval(self, fpn_feats, export_post_process=True):
        """Predict class scores and boxes for every FPN level.

        Args:
            fpn_feats (sequence[Tensor]): One ``(b, c, h, w)`` map per level.
            export_post_process (bool): Decode boxes when True; return raw
                per-level predictions when False.

        Returns:
            With ``export_post_process=True``: scores ``(b, num_classes, N)``
            and boxes ``(b, N, 4)`` scaled by the level stride, where ``N``
            is the total number of locations over all levels.
            With ``export_post_process=False``: two lists with one entry per
            level, scores ``(b, h*w, num_classes)`` and distribution logits
            ``(b, h*w, 4 * (reg_max + 1))``.
        """
        cls_score_list, box_list = [], []
        for i, (fpn_feat, _) in enumerate(zip(fpn_feats, self.fpn_stride)):
            b, _, h, w = fpn_feat.shape
            # task decomposition
            conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i)
            cls_logit = self.head_cls_list[i](se_feat)
            reg_pred = self.head_reg_list[i](se_feat)

            # cls prediction and alignment
            if self.use_align_head:
                cls_prob = torch.sigmoid(self.cls_align[i](conv_cls_feat))
                cls_score = (torch.sigmoid(cls_logit) * cls_prob + eps).sqrt()
            else:
                cls_score = torch.sigmoid(cls_logit)

            if not export_post_process:
                # Use the real batch size; a hard-coded 1 would fold the
                # batch axis into the location axis.
                cls_score_list.append(
                    cls_score.reshape(b, self.cls_out_channels, -1).permute(0, 2, 1))
                box_list.append(
                    reg_pred.reshape(b, (self.reg_max + 1) * 4, -1).permute(0, 2, 1))
            else:
                num_locations = h * w
                cls_score_out = cls_score.reshape(
                    b, self.cls_out_channels, num_locations)
                # Channels last, so each location's 4 distributions are
                # contiguous before the projection.
                bbox_pred = reg_pred.permute(0, 2, 3, 1)
                bbox_pred = self.distribution_project(bbox_pred)
                bbox_pred = bbox_pred.reshape(b, num_locations, 4)
                cls_score_list.append(cls_score_out)
                box_list.append(bbox_pred)

        if export_post_process:
            if self.eval_size:
                anchor_points, stride_tensor = self.anchor_points, self.stride_tensor
            else:
                anchor_points, stride_tensor = self._generate_anchors(fpn_feats)
            cls_score_list = torch.cat(cls_score_list, dim=-1)
            box_list = torch.cat(box_list, dim=1)
            # Anchors are created on the CPU; move them next to the boxes.
            anchor_points = anchor_points.to(box_list.device)
            stride_tensor = stride_tensor.to(box_list.device)
            box_list = self._batch_distance2bbox(anchor_points, box_list)
            box_list = box_list * stride_tensor
        return cls_score_list, box_list

    def forward(self, fpn_feats, export_post_process=True):
        """Alias of ``forward_eval``."""
        # only made for evaluation for now
        return self.forward_eval(fpn_feats, export_post_process)

    def _generate_anchors(self, feats=None):
        """Build the anchor grid and per-anchor stride for every level.

        Args:
            feats (sequence[Tensor] | None): Level feature maps that supply
                the grid sizes; when ``None`` the sizes are derived from
                ``eval_size`` and the strides.

        Returns:
            ``anchor_points`` of shape ``(N, 2)`` holding ``(x, y)`` grid
            coordinates and ``stride_tensor`` of shape ``(N, 1)``.
        """
        # just use in eval time
        anchor_points = []
        stride_tensor = []
        for i, stride in enumerate(self.fpn_stride):
            if feats is not None:
                _, _, h, w = feats[i].shape
            else:
                h = math.ceil(self.eval_size[0] / stride)
                w = math.ceil(self.eval_size[1] / stride)
            shift_x = torch.arange(end=w) + self.cell_offset
            shift_y = torch.arange(end=h) + self.cell_offset
            # Explicit "ij" keeps the historical row-major layout and avoids
            # the warning about the missing indexing argument.
            shift_y, shift_x = torch.meshgrid(shift_y, shift_x, indexing="ij")
            anchor_point = torch.stack([shift_x, shift_y], dim=-1).float()
            anchor_points.append(anchor_point.reshape(-1, 2))
            stride_tensor.append(torch.full((h * w, 1), float(stride)))
        anchor_points = torch.cat(anchor_points)
        stride_tensor = torch.cat(stride_tensor)
        return anchor_points, stride_tensor

    def post_process(self, head_outs, scale_factor, export_nms=True):
        """Placeholder: post-processing is not implemented; returns ``None``."""
        pass

# Feature tower handed to the head below (shared cls/reg branch, SE enabled).
feat = PicoFeat(feat_in=96,
                feat_out=96,
                num_convs=2,
                num_fpn_stride=4,
                norm_type='bn',
                share_cls_reg=True,
                use_se=True)

# test constructor works
# (feat_in_chan matches the feat_out of the tower above)
head = PicoHeadV2(conv_feat=feat, feat_in_chan=96)

# Test End-to-End

In [94]:
# model config
scale = 0.75
feature_maps = [3, 4, 5]
out_channels = 96

backbone = LCNet(scale, feature_maps)
# get backbone output shape
outputs = backbone({'image': torch.randn(1, 3, 320, 320)})
in_channels = [c.size()[1] for c in outputs]

neck = LCPAN(in_channels, out_channels)
outputs = neck(outputs)

# The tower is built for 4 levels although the default head only uses 3.
feat = PicoFeat(feat_in=96,
                feat_out=96,
                num_convs=2,
                num_fpn_stride=4,
                norm_type='bn',
                share_cls_reg=True,
                use_se=True)

head = PicoHeadV2(conv_feat=feat, feat_in_chan=96)

# export_post_process=False returns the raw per-level predictions: one list
# of class scores and one list of box distribution logits.
outputs = head(outputs, export_post_process=False)
cls_score, bbox_list = outputs
print(len(cls_score))
print(cls_score[0].size())
print(len(bbox_list))
print(bbox_list[0].size())

3
torch.Size([1, 1600, 80])
3
torch.Size([1, 1600, 68])


# Picodet full network: https://github.com/PaddlePaddle/PaddleDetection/blob/release/2.4/ppdet/modeling/architectures/picodet.py

In [96]:
class PicoDetV2:
    def __init__(self, backbone, neck, head):
        self.backbone = backbone
        self.neck = neck
        self.head = head
        self.export_post_process = True
        self.export_nms = True
    
    def forward(self, inputs):
        body_feats = self.backbone(inputs)
        fpn_feats = self.neck(body_feats)
        head_outs = self.head(fpn_feats, self.export_post_process)
        
        # post process
        scale_factor