2023.11.26 尝试实现SSD目标检测模型

In [1]:
%matplotlib inline
import torch
import torchvision
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l

In [2]:
def cls_predictor(num_inputs, num_anchors, num_classes):
    """Build the class-prediction layer for one feature map.

    The layer is a 3x3 convolution with padding 1, so the output keeps the
    height and width of its input. Output channel ``i * (q + 1) + j`` holds the
    score of class ``j`` for anchor ``i`` at each pixel, where ``q`` is
    ``num_classes``.

    Args:
        num_inputs: Number of input channels.
        num_anchors: Number of anchor boxes generated per pixel (``a``).
        num_classes: Number of object classes (``q``). One extra slot per
            anchor is added (presumably the background class).

    Returns:
        An ``nn.Conv2d`` with ``num_anchors * (num_classes + 1)`` output channels.
    """
    scores_per_pixel = num_anchors * (num_classes + 1)
    return nn.Conv2d(num_inputs, scores_per_pixel, kernel_size=3, padding=1)

In [3]:
def bbox_predictor(num_inputs, num_anchors):
    """Build the bounding-box prediction layer for one feature map.

    The layer is a 3x3 convolution with padding 1, so the output keeps the
    height and width of its input. It predicts 4 offsets for every anchor at
    each pixel.

    Args:
        num_inputs: Number of input channels.
        num_anchors: Number of anchor boxes generated per pixel.

    Returns:
        An ``nn.Conv2d`` with ``num_anchors * 4`` output channels.
    """
    offsets_per_pixel = 4 * num_anchors
    return nn.Conv2d(num_inputs, offsets_per_pixel, kernel_size=3, padding=1)

In [5]:
# 连结多尺度的预测
def forward(x, block):
    """Apply ``block`` to ``x`` and return the result, i.e. ``block(x)``."""
    result = block(x)
    return result

# Run the class predictor on two feature maps of different scales.
# Inputs are laid out as (batch, channels, height, width).
Y1 = forward(torch.zeros(size=(2, 8, 20, 20)), cls_predictor(8, 5, 10))     # 5 anchors per pixel, 10 classes
Y2 = forward(torch.zeros(size=(2, 16, 10, 10)), cls_predictor(16, 3, 10))   # 3 anchors per pixel, 10 classes
# Expected shapes: [2, 5 * (10 + 1) = 55, 20, 20] and [2, 3 * (10 + 1) = 33, 10, 10].
# The predictions of the two scales are concatenated later for more efficient computation.
Y1.shape, Y2.shape

(torch.Size([2, 55, 20, 20]), torch.Size([2, 33, 10, 10]))

In [7]:
def flatten_pred(pred):
    """Flatten a [B, C, H, W] prediction into [B, H*W*C].

    The channel axis is moved last before flattening, so the values of one
    pixel end up next to each other in the result.
    """
    channels_last = pred.permute(0, 2, 3, 1)
    return channels_last.flatten(start_dim=1)

def concat_preds(preds):
    """Flatten every prediction to [B, H*W*C] and concatenate them along dim 1."""
    flattened = list(map(flatten_pred, preds))
    return torch.cat(flattened, dim=1)

# Per sample: 20 * 20 * 55 + 10 * 10 * 33 = 25300 flattened predictions.
concat_preds([Y1, Y2]).shape

torch.Size([2, 25300])

In [10]:
def down_sample_blk(in_channels, out_channels):
    """Build a block that halves the height and width of its input.

    The block is two (Conv 3x3 -> BatchNorm -> ReLU) stages followed by a 2x2
    max pool. The first convolution maps ``in_channels`` to ``out_channels``;
    the second keeps ``out_channels``.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.

    Returns:
        An ``nn.Sequential`` holding the seven layers in order.
    """
    layers = []
    # Input channel count of each conv stage: only the first stage sees in_channels.
    for stage_in in (in_channels, out_channels):
        layers.extend((
            nn.Conv2d(stage_in, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ))
    # MaxPool2d's stride defaults to its kernel_size, so this halves H and W.
    layers.append(nn.MaxPool2d(kernel_size=2))
    return nn.Sequential(*layers)

# Shape check: [2, 3, 20, 20] --> [2, 10, 10, 10] (channels 3 -> 10, height and width halved).
forward(torch.zeros(size=(2, 3, 20, 20)), down_sample_blk(in_channels=3, out_channels=10)).shape

torch.Size([2, 10, 10, 10])

In [11]:
def base_net():
    """Build the base feature extractor: three chained down-sampling blocks.

    The channel count goes 3 -> 16 -> 32 -> 64, and each block halves the
    height and width.
    """
    channels = [3, 16, 32, 64]
    # Pair each channel count with the next one: (3, 16), (16, 32), (32, 64).
    stages = [
        down_sample_blk(in_channels=c_in, out_channels=c_out)
        for c_in, c_out in zip(channels, channels[1:])
    ]
    return nn.Sequential(*stages)

# Shape check: [2, 3, 256, 256] --> [2, 64, 256 / 2^3 = 32, 256 / 2^3 = 32].
forward(torch.zeros(size=(2, 3, 256, 256)), base_net()).shape

torch.Size([2, 64, 32, 32])

In [None]:
""" 完整的模型 """
def get_blk(i):
    """Return the ``i``-th stage of the complete model.

    Stages by index:
        0: ``base_net()``, the base feature extractor (3 -> 64 channels).
        1: a down-sampling block mapping 64 -> 128 channels.
        4: adaptive average pooling to a 1x1 output.
        anything else (2 and 3 in practice): a down-sampling block mapping
           128 -> 128 channels.

    Args:
        i: Index of the stage to build.

    Returns:
        A freshly constructed ``nn.Module`` for that stage.
    """
    if i == 0:
        return base_net()
    if i == 1:
        # Stage 1 consumes the output of base_net, which has 64 channels,
        # so its first convolution must take 64 input channels (not 3).
        return down_sample_blk(in_channels=64, out_channels=128)
    if i == 4:
        return nn.AdaptiveAvgPool2d(output_size=(1, 1))
    # Stages 2 and 3 keep the channel count at 128.
    return down_sample_blk(in_channels=128, out_channels=128)

