初始化测试函数

In [1]:
import matplotlib
import torch
import torch.nn as nn
import numpy as np 
import math
import matplotlib.pyplot as plt
import torch.nn.functional as F
import addict
%matplotlib inline
from thop import profile
from thop import clever_format
from torchstat import stat

def show_model_stat(model, img_size=(3, 256, 256)):
    stat(model, img_size)


def show_macs_params(model, img_size=(256, 256), dummy_input=None):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)
    model.eval()
    if dummy_input is None:
        dummy_input = torch.randn(1, 3, img_size[0], img_size[1], dtype=torch.float)
    dummy_input = dummy_input.to(device)
    
     # macs == FLOPS, GFLOPS == 1e12 * FLOPS
    macs, params = profile(model, inputs=(dummy_input,), verbose=False) 
    print(f"{model._get_name()}\t\t{macs=}\t{params=}")
    print("FLOPs=", str(macs/1e9) +'{}'.format("G"), end='\t')
    print("params=", str(params/1e6)+'{}'.format("M"))
    macs, params = clever_format([macs, params], "%.3f")
    print(f"{macs=}\t{params=}")


def inference_speed(model, img_size=(256, 256), dummy_input=None, repetitions=10000):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)
    model.eval()
    if dummy_input is None:
        dummy_input = torch.randn(1, 3, img_size[0], img_size[1], dtype=torch.float)
    dummy_input = dummy_input.to(device)

    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)

    
    timings=np.zeros((repetitions,1))
    #GPU-WARM-UP
    for _ in range(10):
        _ = model(dummy_input)
    # MEASURE PERFORMANCE
    with torch.no_grad():
        for rep in range(repetitions):
            starter.record()
            _ = model(dummy_input)
            ender.record()
            # WAIT FOR GPU SYNC
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)
            timings[rep] = curr_time
    mean_syn = np.sum(timings) / repetitions
    std_syn = np.std(timings)
    mean_fps = 1000. / mean_syn
    print('Mean@ {mean_syn:.3f}ms Std@ {std_syn:.3f}ms FPS@ {mean_fps:.2f}'\
        .format(mean_syn=mean_syn, std_syn=std_syn, mean_fps=mean_fps))
    # ! @n 中的n是什么意思？
    # print(' * Mean@1 {mean_syn:.3f}ms Std@5 {std_syn:.3f}ms FPS@1 {mean_fps:.2f}'\
    #     .format(mean_syn=mean_syn, std_syn=std_syn, mean_fps=mean_fps))

In [3]:
cfg_dict = dict(
    mynet=dict(MODEL=dict(name='mynet',
                          input_channel=128,
                          output_channel=21,       # num_joints + 3 region map
                          num_stage=4,
                          num_block=[2, 2, 2],
                          )),
    litehandnet=dict(MODEL=dict(
            name='litehandnet',
            num_stage=4,
            num_block=[2, 2, 2],
            input_channel=128,
            ca_type='ca',
            reduction=4,
            activation="leakyrelu", # 'leakyrelu', 'relu', 'silu'
            output_channel=21,
    )),
    resnet18=dict(MODEL=dict(
                        name='resnet',
                        depth=18,
                        output_channel=21,  # num_joints + 3 region map
                        stem_channels=64,
                        base_channels=64,
                        strides=(1, 2, 2, 2),
                        deep_stem=False,         # stem是否用三个3x3卷积代替7x7卷积
                        num_stages=4,
                        out_indices=(3,),         # (0, 1, 2, 3)
                        )),
    resnet50=dict(MODEL=dict(
                        name='resnet',
                        depth=50,
                        output_channel=21,  # num_joints + 3 region map
                        stem_channels=64,
                        base_channels=64,
                        strides=(1, 2, 2, 2),
                        deep_stem=False,         # stem是否用三个3x3卷积代替7x7卷积
                        num_stages=4,
                        out_indices=(3,),         # (0, 1, 2, 3)
                        )),
    mobilenetv2=dict(MODEL=dict(
                        name='mobilenetv2',
                        widen_factor=1,
                        out_indices=(7,),
                        output_channel=21,
                        )),
    litehrnet18=dict(MODEL=dict(
                        name='litehrnet',
                        depth=18,
                        output_channel=21,  # num_joints + 3 region map
                        )),
    litehrnet30=dict(MODEL=dict(
                        name='litehrnet',
                        depth=30,
                        output_channel=21,  # num_joints + 3 region map
                        )),
    hourglass_2=dict(MODEL=dict(
                        name='hourglass',
                        input_channel=256,
                        output_channel=21,       # num_joints + 3 region map
                        num_stack=2,            # 沙漏模块的个数
                        num_level=4,            # 每个沙漏模块不同尺度特征层个数
                        )),
    hourglass_1=dict(MODEL=dict(
                        name='hourglass',
                        input_channel=256,
                        output_channel=21,       # num_joints + 3 region map
                        num_stack=1,            # 沙漏模块的个数
                        num_level=4,            # 每个沙漏模块不同尺度特征层个数
                        )),
    srhandnet=dict(MODEL=dict(
                        name='srhandnet',
                        output_channel=21,  # num_joints + 3 region map
                        )),
)

1、测试所有模型的参数和推理速度

In [3]:
from models import srhandnet, litehrnet, mynet, resnet, mobilenetv2, hourglass, litehandnet
from copy import deepcopy

# for img_size in [(224, 224), (256, 256)]:
#     for model_name, cfg in cfg_dict.items():
#         print(f"{model_name}\t{img_size}".center(80, '-'))
#         if "hourglass" in model_name and img_size == (224, 224):
#             cfg = deepcopy(cfg)
#             cfg['MODEL']['num_level'] = 3
#         cfg = addict.Dict(cfg)
#         model = eval(cfg.MODEL.name)(cfg)
#         show_macs_params(model, img_size=img_size)
#         inference_speed(model, img_size=img_size)
#         print()
#         if model_name == 'litehandnet':
#             for m in model.modules():
#                 if hasattr(m, 'switch_to_deploy'):
#                     m.switch_to_deploy()
#             show_macs_params(model, img_size=img_size)
#             inference_speed(model, img_size=img_size)
#             print()

2、测试所有模型在同一输入下的参数和推理速度

In [4]:
x = torch.rand(1, 3, 256, 256)
for model_name, cfg in cfg_dict.items():
    print(f"{model_name}".center(80, '-'))
    cfg = addict.Dict(cfg)
    model = eval(cfg.MODEL.name)(cfg)
    # show_model_stat(model)
    show_macs_params(model, dummy_input=x)
    inference_speed(model, dummy_input=x)
    print()
    if model_name == 'litehandnet':
        model.deploy_model()
        show_macs_params(model, dummy_input=x)
        inference_speed(model, dummy_input=x)
        print()

-------------------------------------mynet--------------------------------------


  kernel = torch.DoubleTensor([*(x[0].shape[2:])]) // torch.DoubleTensor(list((m.output_size,))).squeeze()


MultiScaleAttentionHourglass		macs=1135802240.0	params=2240405.0
FLOPs= 1.13580224G	params= 2.240405M
macs='1.136G'	params='2.240M'
Mean@ 71.053ms Std@ 20.139ms FPS@ 14.07

----------------------------------litehandnet-----------------------------------
LiteHandNet		macs=1312865536.0	params=2272981.0
FLOPs= 1.312865536G	params= 2.272981M
macs='1.313G'	params='2.273M'
Mean@ 76.100ms Std@ 19.763ms FPS@ 13.14

LiteHandNet		macs=1297824768.0	params=2265621.0
FLOPs= 1.297824768G	params= 2.265621M
macs='1.298G'	params='2.266M'
Mean@ 63.758ms Std@ 21.498ms FPS@ 15.68

------------------------------------resnet18------------------------------------
PoseResNet		macs=8307625984.0	params=15381589.0
FLOPs= 8.307625984G	params= 15.381589M
macs='8.308G'	params='15.382M'
Mean@ 49.663ms Std@ 16.047ms FPS@ 20.14

------------------------------------resnet50------------------------------------
PoseResNet		macs=12070785024.0	params=30615861.0
FLOPs= 12.070785024G	params= 30.615861M
macs='12.071G'	params=

观察消融实验的结果，可以看出MS_Att模块中的MSRB-DWConv运算速度非常慢

In [5]:
from models import hourglass_ablation
cfgs = {
    0:dict(MODEL=dict(
            name='hourglass_ablation',
            input_channel=256,
            output_channel=21,       # num_joints + 3 region map
            num_stack=2,            # 沙漏模块的个数
            num_level=4,            # 每个沙漏模块不同尺度特征层个数
            pelee_stem=dict(enabled=False, focus=False),
            msrb_att=dict(enabled=False, att_enabled=False),
            pred_bbox=True,          # 模型是否预测边界框, 是则不进行旋转变换
            )),
    1:dict(MODEL=dict(
            name='hourglass_ablation',
            input_channel=256,
            output_channel=21,       # num_joints + 3 region map
            num_stack=2,            # 沙漏模块的个数
            num_level=4,            # 每个沙漏模块不同尺度特征层个数
            pelee_stem=dict(enabled=True, focus=False),
            msrb_att=dict(enabled=False, att_enabled=False),
            pred_bbox=True,          # 模型是否预测边界框, 是则不进行旋转变换
            )),
    2:dict(MODEL=dict(
            name='hourglass_ablation',
            input_channel=256,
            output_channel=21,       # num_joints + 3 region map
            num_stack=2,            # 沙漏模块的个数
            num_level=4,            # 每个沙漏模块不同尺度特征层个数
            pelee_stem=dict(enabled=False, focus=False),
            msrb_att=dict(enabled=True, att_enabled=False),
            pred_bbox=True,          # 模型是否预测边界框, 是则不进行旋转变换
            )),
    3:dict(MODEL=dict(
            name='hourglass_ablation',
            input_channel=256,
            output_channel=21,       # num_joints + 3 region map
            num_stack=2,            # 沙漏模块的个数
            num_level=4,            # 每个沙漏模块不同尺度特征层个数
            pelee_stem=dict(enabled=False, focus=False),
            msrb_att=dict(enabled=True, att_enabled=True),
            pred_bbox=True,          # 模型是否预测边界框, 是则不进行旋转变换
            )),
}
for img_size in [(224, 224), (256, 256)]:
    for cfg_idx in range(len(cfgs)):
        print(f"{cfg_idx=}\t{img_size}".center(80, '-'))
        cfg = addict.Dict(cfgs[cfg_idx]) 
        model = hourglass_ablation(cfg)
        show_macs_params(model, img_size=(256, 256))
        inference_speed(model, img_size=(256, 256))
        print()

------------------------------cfg_idx=0	(224, 224)------------------------------
hourglass_ablation		macs=4390493952.0	params=8894229.0
FLOPs= 4.390493952G	params= 8.894229M
macs='4.390G'	params='8.894M'
Mean@ 100.644ms Std@ 25.156ms FPS@ 9.94

------------------------------cfg_idx=1	(224, 224)------------------------------
hourglass_ablation		macs=4390493952.0	params=8894229.0
FLOPs= 4.390493952G	params= 8.894229M
macs='4.390G'	params='8.894M'
Mean@ 98.995ms Std@ 22.644ms FPS@ 10.10

------------------------------cfg_idx=2	(224, 224)------------------------------
hourglass_ablation		macs=4390493952.0	params=8894229.0
FLOPs= 4.390493952G	params= 8.894229M
macs='4.390G'	params='8.894M'
Mean@ 96.701ms Std@ 24.197ms FPS@ 10.34

------------------------------cfg_idx=3	(224, 224)------------------------------
hourglass_ablation		macs=4390493952.0	params=8894229.0
FLOPs= 4.390493952G	params= 8.894229M
macs='4.390G'	params='8.894M'
Mean@ 99.339ms Std@ 23.206ms FPS@ 10.07

--------------------

In [None]:
from turtle import forward
import torch
from torch import nn
from torch.nn import functional as F
from models import kaiming_init, constant_init, normal_init


class CBL(nn.Module):
    def __init__(self, in_channel, out_channel, kernel=1,
                 stride=1, padding=0, dilation=1, groups=1):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channel, out_channel, kernel, stride,
                      padding, dilation, groups, bias=False),
            nn.BatchNorm2d(out_channel),
            nn.LeakyReLU(inplace=True)
        )
    def forward(self, x):
        return self.conv(x)

class ChannelAttension(nn.Module):
    def __init__(self, channel):
        super().__init__()
        self.att = nn.Sequential(
                    nn.AdaptiveAvgPool2d((3,3)),
                    nn.BatchNorm2d(channel),
                    nn.ReLU(),
                    nn.Conv2d(channel, channel, 3, 1, 0, groups=channel),
                    nn.Flatten(),
                    nn.Dropout(p=0.3),
                    nn.Linear(channel, channel),
                    nn.Sigmoid(),  
                    )
    def forward(self, x):
        b, c, _, _ = x.shape
        x = x * self.att(x).view(b, c, 1, 1)
        return x


class MSRB(nn.Module):
    def __init__(self, channel):
        super().__init__()
        c1 = channel // 2
        c2 = channel * 2
        self.conv11 = CBL(c1, c1, 3, 1, 1, 1, groups=c1)
        self.conv12 = CBL(c1, c1, 3, 1, 2, 2, groups=c1)
        self.conv13 = CBL(channel, channel)   # 信息融合，交换
        self.conv21 = CBL(channel, channel, 3, 1, 1, 1, groups=channel)
        self.conv22 = CBL(channel, channel, 3, 1, 2, 2, groups=channel)
        # 信息融合，交换， 变通道
        self.conv23 = nn.Conv2d(c2, channel, 1, 1, 0, bias=False)   

        self.ca = ChannelAttension(channel)
    def forward(self, x):
        x1, x2 = torch.chunk(x, 2, dim=1)
        a1 = self.conv11(x1)
        a2 = self.conv12(x2)
        a = torch.cat([a1, a2], dim=1)
        a = self.conv13(a)
        b1 = self.conv21(a)
        b2 = self.conv22(a)
        b = torch.cat([b1, b2], dim=1)
        b = self.conv23(b)
        out =  b + x  # skip conection
        out = self.ca(out)
        return out

class CSP_MSRB(nn.Module):
    def __init__(self, in_channel, out_channel):
        super().__init__()
        assert in_channel % 2 == 0
        mid_channel = in_channel // 2
        self.conv1 = CBL(in_channel, mid_channel)
        self.msrb = MSRB(mid_channel)
        self.conv2 = nn.Conv2d(mid_channel, mid_channel, 1, 1, 0)
        self.conv3 = nn.Sequential(
            nn.BatchNorm2d(in_channel),
            nn.LeakyReLU(inplace=True),
            CBL(in_channel, out_channel)
        )
    def forward(self, x):
        x = self.conv1(x)
        x1 = self.msrb(x)
        x2 = self.conv2(x)
        out = self.conv3(torch.cat([x1, x2], dim=1))
        return out


class EncoderDecoder(nn.Module):
    def __init__(self, num_stage=4, num_block=[2,2,2,2], in_channel=128):
        super().__init__()
        self.num_levels = num_stage
        self.encoder = nn.ModuleList([])
        self.decoder = nn.ModuleList([])

        self.maxpool = nn.MaxPool2d(2, 2)
        for i in range(num_stage):
            nb = num_block[i]
            self.encoder.append(
                nn.Sequential(*[CSP_MSRB(in_channel, in_channel) for _ in range(nb)])
            )
            self.decoder.append(
                 nn.Sequential(*[CSP_MSRB(in_channel, in_channel) for _ in range(nb)])
            )


    def forward(self, x):
        out_encoder = []   # [128, 64, 32, 16, 8, 4]
        out_decoder = []   # [4, 8, 16, 32, 64, 128]

        # encoder 
        for i in range(self.num_levels):
            x = self.encoder[i](x)
            out_encoder.append(x)
            if i != self.num_levels - 1:
                x = self.maxpool(x)

        # decoder
        for i in range(self.num_levels-1, -1, -1):
            counterpart = out_encoder[i]
            if i == self.num_levels-1:
                x = self.decoder[i](counterpart)
            else:
                h, w = counterpart.shape[2:]
                x = F.interpolate(x, size=(h, w))
                x = x + counterpart
                x = self.decoder[i](x)
            out_decoder.append(x)
        return tuple(out_decoder) 

class Stem(nn.Module):
    def __init__(self, channel):
        super().__init__()
        mid_channel = max(channel // 4, 32)
        self.conv1 = nn.Sequential(
            CBL(3, mid_channel, 3, 2, 1),
            CBL(mid_channel, mid_channel, 3, 1, 1, groups=mid_channel)
        )
        self.branch1 = nn.Sequential(
            CBL(mid_channel, mid_channel),
            CBL(mid_channel, mid_channel, 3, 2, 1)
        )
        self.branch2 = nn.MaxPool2d(2, 2, ceil_mode=True)
        self.conv2 = nn.Conv2d(2*mid_channel, channel, 1, 1, 0)

    def forward(self, x):
        out = self.conv1(x)
        b1 = self.branch1(out)
        b2 = self.branch1(out)
        out = self.conv2(torch.cat([b1, b2], dim=1))
        return out

class LiteHandNet(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        num_stage=cfg.MODEL.get('num_stage', 4)
        num_block=cfg.MODEL.get('num_block', [2, 2, 2, 2])
        input_channel=cfg.MODEL.get('input_channel', 128)
        output_channel=cfg.MODEL.get('output_channel', cfg.DATASET.num_joints)

        self.stem = Stem(input_channel)
        self.backone = EncoderDecoder(num_stage, num_block, input_channel)
        self.neck = CBL(input_channel, input_channel)
        self.head = nn.Conv2d(input_channel, output_channel, 1, 1, 0)
        self.init_weights()

    def forward(self, x):
        out_stem = self.stem(x)
        out_backbone = self.backone(out_stem)

        out_backbone_last_stage = out_backbone[-1]
        out_neck = self.neck(out_backbone_last_stage)
        out = self.head(out_neck)
        return out

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # kaiming_init(m)
                normal_init(m)
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                constant_init(m, 1)
        

In [None]:
import addict
cfg = dict(MODEL=dict(
    num_stage=4,
    num_block=[1, 4, 6, 6],
    increase=False,
    input_channel=128,
    output_channel=21,
))
cfg = addict.Dict(cfg)

model = LiteHandNet(cfg)
for img_size in [(224, 224), (256, 256)]:
        print(f"{img_size}".center(80, '-'))
        show_macs_params(model, img_size=(256, 256))
        inference_speed(model, img_size=(256, 256))
        print()

3、统计各个数据集的图片个数

In [5]:
import os
import json
from pathlib import Path
freihand_ann_root = "data/handset/freihand/annotations"
rhd_ann_root = "data/handset/OneHand10K/annotations"
panoptic_ann_root = "data/handset/panoptic/annotations"
onehand10k_ann_root = "data/handset/RHD/annotations"

for ann_root in [freihand_ann_root, rhd_ann_root, panoptic_ann_root, onehand10k_ann_root]:
    files = os.listdir(ann_root)
    ann_root = Path(ann_root)
    for file in files:
        ann_file = ann_root.joinpath(file)
        json_file = json.load(ann_file.open(mode='r'))
        print(f"{file} => {len(json_file['images'])}")
        print(f"{file} => {len(json_file['annotations'])}")
    print()

freihand_train.json => 104192
freihand_train.json => 104192
freihand100.json => 100
freihand100.json => 100
freihand_val.json => 13024
freihand_val.json => 13024
freihand_test.json => 13024
freihand_test.json => 13024
freihand100_train.json => 100
freihand100_train.json => 100
freihand100_val.json => 100
freihand100_val.json => 100
freihand_train_6400.json => 6400
freihand_train_6400.json => 6400
freihand_val_6400.json => 6400
freihand_val_6400.json => 6400

onehand10k_test.json => 1703
onehand10k_test.json => 1703
onehand10k_train.json => 10000
onehand10k_train.json => 10000

panoptic_test.json => 846
panoptic_test.json => 846
panoptic_train.json => 16729
panoptic_train.json => 16729

rhd_test.json => 2727
rhd_test.json => 2727
rhd_train.json => 41255
rhd_train.json => 41255



In [2]:
from models import atthandnet
model = atthandnet(None)
model.eval()
x = torch.rand(1, 3, 224, 224)
y = model(x)
print(f"{y.shape=}")
show_macs_params(model, dummy_input=x)
inference_speed(model, dummy_input=x, repetitions=500)

y.shape=torch.Size([1, 21, 2])
light_Model		macs=2253977686.0	params=1894628.0
FLOPs= 2.253977686G	params= 1.894628M
macs='2.254G'	params='1.895M'
Mean@ 233.892ms Std@ 3.600ms FPS@ 4.28


In [5]:
from models import mynet
cfg=dict(MODEL=dict(name='mynet',
                        input_channel=160,
                        output_channel=21,       # num_joints + 3 region map
                        num_stage=4,
                        num_block=[2, 2, 2],
                        ))
model = mynet(addict.Dict(cfg))
x = torch.rand(1, 3, 256, 256)
show_macs_params(model, dummy_input=x)
inference_speed(model, dummy_input=x, repetitions=500)

MultiScaleAttentionHourglass		macs=1750821216.0	params=3490101.0
FLOPs= 1.750821216G	params= 3.490101M
macs='1.751G'	params='3.490M'
Mean@ 17.776ms Std@ 0.479ms FPS@ 56.25


In [9]:
x = torch.rand(1, 3, 256, 256)
y = x.view(3, -1)
y.shape

torch.Size([3, 65536])

AttributeError: module 'torch' has no attribute 'view'

RuntimeError: The size of tensor a (256) must match the size of tensor b (65536) at non-singleton dimension 3