In [11]:
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.ops import DeformConv2dPack 
from mmcv.runner import BaseModule, auto_fp16
from timm.models.layers import PatchEmbed

# from ..builder import NECKS


# @NECKS.register_module()
class MyNeck(BaseModule):
    r"""Feature Pyramid Network.

    This is an implementation of paper `Feature Pyramid Networks for Object
    Detection <https://arxiv.org/abs/1612.03144>`_.

    Args:
        in_channels (list[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
        num_outs (int): Number of output scales.
        start_level (int): Index of the start input backbone level used to
            build the feature pyramid. Default: 0.
        end_level (int): Index of the end input backbone level (exclusive) to
            build the feature pyramid. Default: -1, which means the last level.
        add_extra_convs (bool | str): If bool, it decides whether to add conv
            layers on top of the original feature maps. Default to False.
            If True, it is equivalent to `add_extra_convs='on_input'`.
            If str, it specifies the source feature map of the extra convs.
            Only the following options are allowed

            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
            - 'on_lateral': Last feature map after lateral convs.
            - 'on_output': The last output feature map after fpn convs.
        relu_before_extra_convs (bool): Whether to apply relu before the extra
            conv. Default: False.
        no_norm_on_lateral (bool): Whether to apply norm on lateral.
            Default: False.
        conv_cfg (dict): Config dict for convolution layer. Default: None.
        norm_cfg (dict): Config dict for normalization layer. Default: None.
        act_cfg (dict): Config dict for activation layer in ConvModule.
            Default: None.
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: dict(mode='nearest').
        init_cfg (dict or list[dict], optional): Initialization config dict.

    Example:
        >>> import torch
        >>> in_channels = [2, 3, 5, 7]
        >>> scales = [340, 170, 84, 43]
        >>> inputs = [torch.rand(1, c, s, s)
        ...           for c, s in zip(in_channels, scales)]
        >>> self = MyNeck(in_channels, 11, len(in_channels)).eval()
        >>> outputs = self.forward(inputs)
        >>> for i in range(len(outputs)):
        ...     print(f'outputs[{i}].shape = {outputs[i].shape}')
        outputs[0].shape = torch.Size([1, 11, 340, 340])
        outputs[1].shape = torch.Size([1, 11, 170, 170])
        outputs[2].shape = torch.Size([1, 11, 84, 84])
        outputs[3].shape = torch.Size([1, 11, 43, 43])
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_outs,
                 start_level=0,
                 end_level=-1,
                 add_extra_convs=False,
                 relu_before_extra_convs=False,
                 no_norm_on_lateral=False,
                 conv_cfg=None,
                 norm_cfg=None,
                 act_cfg=None,
                 upsample_cfg=dict(mode='nearest'),
                 init_cfg=dict(
                     type='Xavier', layer='Conv2d', distribution='uniform')):
        super(MyNeck, self).__init__(init_cfg)
        assert isinstance(in_channels, list)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_ins = len(in_channels)
        self.num_outs = num_outs
        self.relu_before_extra_convs = relu_before_extra_convs
        self.no_norm_on_lateral = no_norm_on_lateral
        self.fp16_enabled = False
        self.upsample_cfg = upsample_cfg.copy()

        if end_level == -1 or end_level == self.num_ins - 1:
            self.backbone_end_level = self.num_ins
            assert num_outs >= self.num_ins - start_level
        else:
            # if end_level is not the last level, no extra level is allowed
            self.backbone_end_level = end_level + 1
            assert end_level < self.num_ins
            assert num_outs == end_level - start_level + 1
        self.start_level = start_level
        self.end_level = end_level
        self.add_extra_convs = add_extra_convs
        assert isinstance(add_extra_convs, (str, bool))
        if isinstance(add_extra_convs, str):
            # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
            assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
        elif add_extra_convs:  # True
            self.add_extra_convs = 'on_input'

        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()

        for i in range(self.start_level, self.backbone_end_level):
            # l_conv = ConvModule(
            #     in_channels[i],
            #     out_channels,
            #     1,
            #     conv_cfg=conv_cfg,
            #     norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
            #     act_cfg=act_cfg,
            #     inplace=False)
            l_conv = DeformConv2dPack(in_channels[i], out_channels, 1)
            # fpn_conv = ConvModule(
            #     out_channels,
            #     out_channels,
            #     3,
            #     padding=1,
            #     conv_cfg=conv_cfg,
            #     norm_cfg=norm_cfg,
            #     act_cfg=act_cfg,
            #     inplace=False)
            # fpn_conv= PatchEmbed(img_size=340, patch_size=2, embed_dim=in_channels[i], in_chans=in_channels[i], flatten=False)
            fpn_conv = DeformConv2dPack(out_channels,out_channels,3,padding=1)

            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        # add extra conv layers (e.g., RetinaNet)
        extra_levels = num_outs - self.backbone_end_level + self.start_level
        if self.add_extra_convs and extra_levels >= 1:
            for i in range(extra_levels):
                if i == 0 and self.add_extra_convs == 'on_input':
                    in_channels = self.in_channels[self.backbone_end_level - 1]
                else:
                    in_channels = out_channels
                extra_fpn_conv = ConvModule(
                    in_channels,
                    out_channels,
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(extra_fpn_conv)
        print("Fpc ", self.fpn_convs)

    @auto_fp16()
    def forward(self, inputs):
        """Forward function."""
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i + self.start_level])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            # In some cases, fixing `scale factor` (e.g. 2) is preferred, but
            #  it cannot co-exist with `size` in `F.interpolate`.
            if 'scale_factor' in self.upsample_cfg:
                # fix runtime error of "+=" inplace operation in PyTorch 1.10
                laterals[i - 1] = laterals[i - 1] + F.interpolate(
                    laterals[i], **self.upsample_cfg)
            else:
                prev_shape = laterals[i - 1].shape[2:]
                laterals[i - 1] = laterals[i - 1] + F.interpolate(
                    laterals[i], size=prev_shape, **self.upsample_cfg)

        # build outputs
        # part 1: from original levels
        outs = [
            self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels)
        ]
        # print("Fpc ", outs[0].shape)
        # part 2: add extra levels
        # print("aquiii")
        if self.num_outs > len(outs):
            # use max pool to get more levels on top of outputs
            # (e.g., Faster R-CNN, Mask R-CNN)
            if not self.add_extra_convs:
                for i in range(self.num_outs - used_backbone_levels):
                    outs.append(F.max_pool2d(outs[-1], 1, stride=2))
            # add conv layers on top of original feature maps (RetinaNet)
            else:
                if self.add_extra_convs == 'on_input':
                    extra_source = inputs[self.backbone_end_level - 1]
                elif self.add_extra_convs == 'on_lateral':
                    extra_source = laterals[-1]
                elif self.add_extra_convs == 'on_output':
                    extra_source = outs[-1]
                else:
                    raise NotImplementedError
                outs.append(self.fpn_convs[used_backbone_levels](extra_source))
                for i in range(used_backbone_levels + 1, self.num_outs):
                    if self.relu_before_extra_convs:
                        outs.append(self.fpn_convs[i](F.relu(outs[-1])))
                    else:
                        outs.append(self.fpn_convs[i](outs[-1]))
        return tuple(outs)


In [12]:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule, ModuleList, auto_fp16

# from mmocr.models.builder import NECKS


# @NECKS.register_module()
class FPNF(BaseModule):
    """FPN-like fusion module in Shape Robust Text Detection with Progressive
    Scale Expansion Network.
    Args:
        in_channels (list[int]): A list of number of input channels.
        out_channels (int): The number of output channels.
        fusion_type (str): Type of the final feature fusion layer. Available
            options are "concat" and "add".
        init_cfg (dict or list[dict], optional): Initialization configs.
    """

    def __init__(self,
                 in_channels=[256, 512, 44, 2048],
                 out_channels=256,
                 fusion_type='concat',
                 init_cfg=dict(
                     type='Xavier', layer='Conv2d', distribution='uniform')):
        super().__init__(init_cfg=init_cfg)
        conv_cfg = None
        norm_cfg = dict(type='BN')
        act_cfg = dict(type='ReLU')

        self.in_channels = in_channels
        self.out_channels = out_channels

        self.lateral_convs = ModuleList()
        self.fpn_convs = ModuleList()
        self.backbone_end_level = len(in_channels)
        for i in range(self.backbone_end_level):
            l_conv = ConvModule(
                in_channels[i],
                out_channels,
                1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg,
                inplace=False)
            self.lateral_convs.append(l_conv)

            if i < self.backbone_end_level - 1:
                fpn_conv = ConvModule(
                    out_channels,
                    out_channels,
                    3,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg,
                    inplace=False)
                self.fpn_convs.append(fpn_conv)

        self.fusion_type = fusion_type

        if self.fusion_type == 'concat':
            feature_channels = 44
        elif self.fusion_type == 'add':
            feature_channels = 256
        else:
            raise NotImplementedError

        self.output_convs = ConvModule(
            feature_channels,
            out_channels,
            3,
            padding=1,
            conv_cfg=None,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg,
            inplace=False)

    @auto_fp16()
    def forward(self, inputs):
        """
        Args:
            inputs (list[Tensor]): Each tensor has the shape of
                :math:`(N, C_i, H_i, W_i)`. It usually expects 4 tensors
                (C2-C5 features) from ResNet.
        Returns:
            Tensor: A tensor of shape :math:`(N, C_{out}, H_0, W_0)` where
            :math:`C_{out}` is ``out_channels``.
        """
        assert len(inputs) == len(self.in_channels)

        # build laterals
        laterals = [
            lateral_conv(inputs[i])
            for i, lateral_conv in enumerate(self.lateral_convs)
        ]

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            # step 1: upsample to level i-1 size and add level i-1
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + F.interpolate(
                laterals[i], size=prev_shape, mode='nearest')
            # step 2: smooth level i-1
            laterals[i - 1] = self.fpn_convs[i - 1](laterals[i - 1])

        # upsample and cont
        bottom_shape = laterals[0].shape[2:]
        for i in range(1, used_backbone_levels):
            laterals[i] = F.interpolate(
                laterals[i], size=bottom_shape, mode='nearest')

        if self.fusion_type == 'concat':
            out = torch.cat(laterals, 1)
        elif self.fusion_type == 'add':
            out = laterals[0]
            for i in range(1, used_backbone_levels):
                out += laterals[i]
        else:
            raise NotImplementedError
        out = self.output_convs(out)

        return out

In [13]:
import torch
in_channels = [96, 192, 384, 768]
scales = [340, 170, 84, 43]
inputs = [torch.rand(1, c, s, s) for c, s in zip(in_channels, scales)]
demo1= [(1,c,s,s) for c,s in zip(in_channels,scales)]

In [15]:
self_neck = MyNeck(in_channels, 256, num_outs=5).eval()
print()

Fpc  ModuleList(
  (0): DeformConv2dPack(in_channels=256,
  out_channels=256,
  kernel_size=(3, 3),
  stride=(1, 1),
  padding=(1, 1),
  dilation=(1, 1),
  groups=1,
  deform_groups=1,
  bias=False)
  (1): DeformConv2dPack(in_channels=256,
  out_channels=256,
  kernel_size=(3, 3),
  stride=(1, 1),
  padding=(1, 1),
  dilation=(1, 1),
  groups=1,
  deform_groups=1,
  bias=False)
  (2): DeformConv2dPack(in_channels=256,
  out_channels=256,
  kernel_size=(3, 3),
  stride=(1, 1),
  padding=(1, 1),
  dilation=(1, 1),
  groups=1,
  deform_groups=1,
  bias=False)
  (3): DeformConv2dPack(in_channels=256,
  out_channels=256,
  kernel_size=(3, 3),
  stride=(1, 1),
  padding=(1, 1),
  dilation=(1, 1),
  groups=1,
  deform_groups=1,
  bias=False)
)



In [16]:
for i in inputs:
    print(i.shape)

torch.Size([1, 96, 340, 340])
torch.Size([1, 192, 170, 170])
torch.Size([1, 384, 84, 84])
torch.Size([1, 768, 43, 43])


In [17]:
# import timm
# m= timm.create_model('resnet50', features_only=True, pretrained=True)
# o= m(inputs[1])
# for x in o:
#     print(x.shape)

In [18]:
# self2= FPNF(in_channels, 11).eval()
# outputs2 = self2.forward(inputs)
# for i in range(len(outputs2)):
#     print(f'outputs[{i}].shape = {outputs2[i].shape}')
# outputs

In [19]:
outputs = self_neck.forward(inputs)
for i in range(len(outputs)):
    print(f'outputs[{i}].shape = {outputs[i].shape}')

aquiii
outputs[0].shape = torch.Size([1, 256, 340, 340])
outputs[1].shape = torch.Size([1, 256, 170, 170])
outputs[2].shape = torch.Size([1, 256, 84, 84])
outputs[3].shape = torch.Size([1, 256, 43, 43])
outputs[4].shape = torch.Size([1, 256, 22, 22])


In [31]:
inputs[1].shape
# inputs[2]

torch.Size([1, 192, 170, 170])

In [7]:
C

NameError: name 'C' is not defined

In [42]:
N,C,H,W= inputs[0].shape
l_conv = ConvModule(C, 11, 1, inplace=False)
aaa=l_conv(inputs[0])
print(aaa.shape)    
# test1= torch.Conv2d(11, 11, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
fpn_conv = ConvModule(11, 11, 3, padding=1, inplace=False)
aa1=fpn_conv(aaa)
print(aa1.shape)


torch.Size([1, 11, 340, 340])
torch.Size([1, 11, 340, 340])


In [34]:
inputs[0].shape

torch.Size([1, 96, 340, 340])

In [43]:
from timm.models.layers import PatchEmbed
from timm.models.swin_transformer import PatchMerging
import numpy as np
# xi = torch.randn(1, 3, 224, 224)
xi=inputs[0]
print(xi.shape)
N,C,H,W= xi.shape

patch_embed = PatchEmbed(img_size=H, patch_size=2, embed_dim=256, in_chans=C, flatten=False) #stage 1  (N,((h/p_s)*(h/p_s)), Emb_Dim)  pz=4, emb=96
pe=patch_embed(xi) 
print("pe", pe.shape)
res= np.sqrt(pe.shape[1]) if np.sqrt(pe.shape[1])%2 == 0 else np.sqrt(pe.shape[1])-1
print(res)
patch_merg= PatchMerging(input_resolution=(int(res), int(res)), dim=11, 
                        norm_layer=nn.LayerNorm)
#dim= num of input channels
pm=patch_merg(pe)
print(pm.shape)
# patches = patches.contiguous().view(N, C*kernel_size*kernel_size, -1)
# print(patches.shape) # [B, C*prod(kernel_size), L] as expected by Fold

torch.Size([1, 96, 340, 340])
pe torch.Size([1, 256, 170, 170])
16.0


ValueError: too many values to unpack (expected 3)

In [21]:
l2= ConvModule(1,56*56,3)
l2(x).shape
#checar que onda con las medidas de uina imagen real porque esto esta sencillo 

torch.Size([3136, 3134, 94])

In [10]:
# B, C, W, H = 1, 3, 170, 170
# x = torch.randn(B, C, H, W)
x=inputs[1]
N,C,H,W= x.shape

kernel_size = 3
stride = 64
patches = x.unfold(3, kernel_size, stride).unfold(2, kernel_size, stride)
print(patches.shape) # [B, C, nb_patches_h, nb_patches_w, kernel_size, kernel_size]

# perform the operations on each patch
# ...

# reshape output to match F.fold input
patches = patches.contiguous().transpose(1,0).reshape(N, C, -1, kernel_size*kernel_size).transpose(0,1)
print(patches.shape) # [B, C, nb_patches_all, kernel_size*kernel_size]
patches = patches.permute(0, 1, 3, 2) 
print(patches.shape) # [B, C, kernel_size*kernel_size, nb_patches_all]
patches = patches.contiguous().view(N, C*kernel_size*kernel_size, -1)
print(patches.shape) # [B, C*prod(kernel_size), L] as expected by Fold
# https://pytorch.org/docs/stable/nn.html#torch.nn.Fold

output = F.fold(
    patches, output_size=(H, W), kernel_size=kernel_size, stride=stride)
print(output.shape) # [B, C, H, W]

torch.Size([1, 3, 3, 3, 3, 3])
torch.Size([3, 1, 9, 9])
torch.Size([3, 1, 9, 9])
torch.Size([1, 27, 9])
torch.Size([1, 3, 170, 170])


In [11]:
inputs[1]

tensor([[[[0.1497, 0.2643, 0.4580,  ..., 0.6415, 0.8368, 0.9372],
          [0.8278, 0.6953, 0.2105,  ..., 0.8302, 0.9618, 0.6046],
          [0.4854, 0.7722, 0.3938,  ..., 0.9372, 0.9177, 0.4546],
          ...,
          [0.5123, 0.1051, 0.6526,  ..., 0.7913, 0.3375, 0.2487],
          [0.8430, 0.3039, 0.4511,  ..., 0.2935, 0.7278, 0.1007],
          [0.3944, 0.0967, 0.2826,  ..., 0.2515, 0.1209, 0.1671]],

         [[0.9015, 0.2391, 0.8272,  ..., 0.3037, 0.4444, 0.6748],
          [0.7975, 0.6653, 0.6949,  ..., 0.7353, 0.4852, 0.8811],
          [0.1499, 0.4388, 0.0120,  ..., 0.9374, 0.2205, 0.5364],
          ...,
          [0.0409, 0.6899, 0.2713,  ..., 0.6936, 0.8746, 0.2058],
          [0.0594, 0.1545, 0.8081,  ..., 0.7368, 0.9585, 0.7482],
          [0.6078, 0.7251, 0.2487,  ..., 0.6656, 0.7651, 0.9039]],

         [[0.1571, 0.5390, 0.6835,  ..., 0.0258, 0.9691, 0.9158],
          [0.1764, 0.7709, 0.9666,  ..., 0.5147, 0.4872, 0.6460],
          [0.4582, 0.4426, 0.9471,  ..., 0

In [12]:
output

tensor([[[[0.1497, 0.8278, 0.4854,  ..., 0.0000, 0.0000, 0.0000],
          [0.2643, 0.6953, 0.7722,  ..., 0.0000, 0.0000, 0.0000],
          [0.4580, 0.2105, 0.3938,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.9015, 0.7975, 0.1499,  ..., 0.0000, 0.0000, 0.0000],
          [0.2391, 0.6653, 0.4388,  ..., 0.0000, 0.0000, 0.0000],
          [0.8272, 0.6949, 0.0120,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.1571, 0.1764, 0.4582,  ..., 0.0000, 0.0000, 0.0000],
          [0.5390, 0.7709, 0.4426,  ..., 0.0000, 0.0000, 0.0000],
          [0.6835, 0.9666, 0.9471,  ..., 0

In [8]:
inputs[0].shape

torch.Size([1, 2, 340, 340])

In [12]:
from timm.models.layers import PatchEmbed
x = torch.randn(1, 3, 224, 224)
patch_embed = PatchEmbed(img_size=224, patch_size=4, embed_dim=96)
patch_embed(x).shape

torch.Size([1, 3136, 96])

In [76]:
import timm
vit=timm.create_model('vit_base_patch16_224', pretrained=False)
vit.eval()
outt= vit(torch.rand(1, 2, 224, 224))
outt.shape

RuntimeError: Given groups=1, weight of size [768, 3, 16, 16], expected input[1, 2, 224, 224] to have 3 channels, but got 2 channels instead