In [1]:
# Report the versions of the core dependencies this notebook runs against.
import torch
import mmengine
import mmcv
import mmdet
import mmdet3d

for version in (
        torch.__version__,
        torch.version.cuda,  # CUDA version reported by the PyTorch build
        mmengine.__version__,
        mmcv.__version__,
        mmdet.__version__,
        mmdet3d.__version__,
):
    print(version)

  from .autonotebook import tqdm as notebook_tqdm


1.9.0+cu111
11.1
0.10.3
1.6.0
2.24.0
1.0.0rc5


In [None]:
from mmengine.config import Config
from projects.mmdet3d_plugin.models.detectors import CmtDetector
from mmdet3d.registry import MODELS

In [None]:
# Hand-written model configuration for the CMT detector (camera + LiDAR fusion).
#
# Spatial extent of the point cloud as
# [x_min, y_min, z_min, x_max, y_max, z_max].
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
# Voxel edge lengths along [x, y, z].
voxel_size = [0.1, 0.1, 0.2]
out_size_factor = 8
evaluation = dict(interval=20)
dataset_type = 'CustomNuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
    use_lidar=True,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False)

# Number of voxels along [x, y, z]. Derived from the range and the voxel size
# so the three values can never drift apart; the previous hard-coded 1024 did
# not match (54 - (-54)) / 0.1 = 1080.
grid_size = [
    round((upper - lower) / size)
    for lower, upper, size in zip(
        point_cloud_range[:3], point_cloud_range[3:], voxel_size)
]
# Shape handed to the sparse middle encoder, ordered [z + 1, y, x]. This keeps
# the layout of the original literal ([41, ., .] against a z grid of 40).
sparse_shape = [grid_size[2] + 1, grid_size[1], grid_size[0]]

config = dict(
    # `type` is left out because the detector class is instantiated directly
    # instead of being built through the registry.
    # type='CmtDetector',
    use_grid_mask=True,
    img_backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(2, 3),
        frozen_stages=-1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        with_cp=True,
        style='pytorch'),
    img_neck=dict(
        type='CPFPN',
        in_channels=[1024, 2048],
        out_channels=256,
        num_outs=2),
    pts_voxel_layer=dict(
        num_point_features=5,
        max_num_points=10,
        voxel_size=voxel_size,
        max_voxels=(120000, 160000),
        point_cloud_range=point_cloud_range),
    pts_voxel_encoder=dict(
        type='HardSimpleVFE',
        num_features=5,
    ),
    pts_middle_encoder=dict(
        type='SparseEncoder',
        in_channels=5,
        sparse_shape=sparse_shape,
        output_channels=128,
        order=('conv', 'norm', 'act'),
        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
        encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
        block_type='basicblock'),
    pts_backbone=dict(
        type='SECOND',
        in_channels=256,
        out_channels=[128, 256],
        layer_nums=[5, 5],
        layer_strides=[1, 2],
        norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
        conv_cfg=dict(type='Conv2d', bias=False)),
    pts_neck=dict(
        type='SECONDFPN',
        in_channels=[128, 256],
        out_channels=[256, 256],
        upsample_strides=[1, 2],
        norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
        upsample_cfg=dict(type='deconv', bias=False),
        use_conv_for_no_stride=True),
    pts_bbox_head=dict(
        type='CmtHead',
        in_channels=512,
        hidden_dim=256,
        downsample_scale=8,
        common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
        tasks=[
            # A single task covering every class; reuse `class_names` so the
            # list and the count cannot get out of sync.
            dict(num_class=len(class_names), class_names=list(class_names)),
        ],
        bbox_coder=dict(
            type='MultiTaskBBoxCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=point_cloud_range,
            max_num=300,
            voxel_size=voxel_size,
            num_classes=len(class_names)),
        separate_head=dict(
            type='SeparateTaskHead', init_bias=-2.19, final_kernel=1),
        transformer=dict(
            type='CmtTransformer',
            decoder=dict(
                type='PETRTransformerDecoder',
                return_intermediate=True,
                num_layers=6,
                transformerlayers=dict(
                    type='PETRTransformerDecoderLayer',
                    with_cp=False,
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='PETRMultiheadFlashAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                    ],
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=256,
                        feedforward_channels=1024,
                        num_fcs=2,
                        ffn_drop=0.,
                        act_cfg=dict(type='ReLU', inplace=True),
                    ),

                    feedforward_channels=1024,  # unused
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')),
            )),
        loss_cls=dict(type='mmdet.FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=2.0),
        loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=0.25),
        loss_heatmap=dict(type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
    ),
    train_cfg=dict(
        pts=dict(
            dataset='nuScenes',
            assigner=dict(
                type='HungarianAssigner3D',
                # cls_cost=dict(type='ClassificationCost', weight=2.0),
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                # Fake cost. This is just to make it compatible with DETR head.
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=point_cloud_range,
                code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
            ),
            pos_weight=-1,
            gaussian_overlap=0.1,
            min_radius=2,
            grid_size=list(grid_size),  # [x_len, y_len, z_len] in voxels
            voxel_size=voxel_size,
            out_size_factor=out_size_factor,
            code_weights=[2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
            point_cloud_range=point_cloud_range)),
    test_cfg=dict(
        pts=dict(
            dataset='nuScenes',
            grid_size=list(grid_size),
            out_size_factor=out_size_factor,
            pc_range=point_cloud_range,
            voxel_size=voxel_size,
            nms_type=None,
            nms_thr=0.2,
            use_rotate_nms=True,
            max_num=200
        )))

In [None]:
# Wrap the plain `config` dict in an mmengine `Config` object.
cfg = Config(config)

In [None]:
# Instantiate the detector class directly, passing each top-level config entry
# as a keyword argument (the registry is bypassed, so the config has no `type`).
model = CmtDetector(**cfg)

In [None]:
# Move the model to the GPU, then switch it to evaluation mode for the
# smoke tests that follow.
model.cuda()
model.eval()

In [None]:
import torch

In [None]:
# Smoke-test feature extraction on random inputs.
# NOTE(review): `img` is presumably (batch, cameras, channels, H, W) and
# `points` (batch, num_points, point_features) -- confirm against `extract_feat`.
img = torch.rand(3, 6, 3, 800, 320).cuda()
img_metas = {}
points = torch.rand(3, 50000, 5).cuda()
# Stretch the first two point features from [0, 1) to [0, 100).
points[:, :, :2] *= 100
# Inference only: skip autograd bookkeeping so no graph is kept in GPU memory.
with torch.no_grad():
    img_feats, pts_feats = model.extract_feat(points, img=img, img_metas=img_metas)
len(img_feats), img_feats[0].shape, img_feats[1].shape, len(pts_feats), pts_feats[0].shape

In [None]:
from mmdet.models.utils import multi_apply
def forward_single(pts_feat, img_feat, img_meta):
    """Debug stand-in for a per-level forward: log the inputs and echo them back.

    Prints the length of each argument on one line, then both feature shapes
    and the meta object on a second line.

    Returns:
        tuple: ``(pts_feat, img_feat, img_meta)``, unchanged.
    """
    inputs = (pts_feat, img_feat, img_meta)
    print(*(len(item) for item in inputs))
    print(pts_feat.shape, img_feat.shape, img_meta)
    return inputs

# Repeat the same meta object once per point-cloud feature level so it can be
# passed to `multi_apply` alongside the feature lists.
# A fresh name is used on purpose: rebinding `img_metas` to a list built from
# itself made this cell non-idempotent (each re-run nested the list deeper).
metas_per_level = [img_metas for _ in range(len(pts_feats))]
result = multi_apply(forward_single, pts_feats, img_feats, metas_per_level)
len(result), len(result[0]), len(result[1])

In [None]:
# Smoke-test the full prediction path on random inputs.
# NOTE(review): the tensors use a batch of 3 while `img_metas` holds a single
# entry -- confirm this matches what `model.predict` expects.
img = torch.rand(3, 6, 3, 800, 320).cuda()
img_metas = [{'pad_shape': [(800, 320, 3)]}]
points = torch.rand(3, 50000, 5).cuda()
# Stretch the first two point features from [0, 1) to [0, 100).
points[:, :, :2] *= 100
# Inference only: skip autograd bookkeeping so no graph is kept in GPU memory.
with torch.no_grad():
    predictions = model.predict([points], [img_metas], [img])
predictions

In [None]:
# Check how the padded image shape is reached inside the meta structure:
# first sample -> 'pad_shape' -> first entry.
img_metas = [{'pad_shape': [(800, 320, 3)]}]
img_metas[0]['pad_shape'][0]

In [None]:
# Display the transformer module built inside the bbox head.
model.pts_bbox_head.transformer

In [6]:
import argparse
import logging
import os
import os.path as osp
import torch

from mmengine.config import Config, DictAction
from mmengine.logging import print_log
from mmengine.registry import RUNNERS
from mmengine.runner import Runner

from mmdet3d.utils import replace_ceph_backend

from projects.mmdet3d_plugin.models.detectors import CmtDetector

In [2]:
# Load the full experiment config from disk and build an mmengine Runner from it.
# NOTE: this rebinds `cfg`, which earlier in the notebook held the hand-written
# model-only config.
cfg = Config.fromfile('projects/configs/fusion/my_cmt.py')
# Alternative config kept for comparison:
# cfg = Config.fromfile('../mmdetection3d/projects/BEVFusion/configs/my_bevfusion.py')
# Set the work directory to an absolute path under the current directory.
cfg.work_dir = osp.abspath('./work_dirs')
runner = Runner.from_cfg(cfg)

  warn(


03/11 22:34:12 - mmengine - INFO - 
------------------------------------------------------------
System environment:
    sys.platform: linux
    Python: 3.8.18 | packaged by conda-forge | (default, Oct 10 2023, 15:44:36) [GCC 12.3.0]
    CUDA available: True
    MUSA available: False
    numpy_random_seed: 270868770
    GPU 0: NVIDIA GeForce RTX 3060
    CUDA_HOME: /usr/local/cuda-12.2
    NVCC: Cuda compilation tools, release 12.2, V12.2.91
    GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
    PyTorch: 2.1.2
    PyTorch compiling details: PyTorch built with:
  - GCC 9.3
  - C++ Version: 201703
  - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX2
  - CUDA Runtime 12.1
  - NVCC architecture f



03/11 22:34:16 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
03/11 22:34:16 - mmengine - INFO - Hooks will be executed in the following order:
before_run:
(VERY_HIGH   ) RuntimeInfoHook                    
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
before_train:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
before_train_epoch:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(NORMAL      ) DistSamplerSeedHook                
 -------------------- 
before_train_iter:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
 -------------------- 
after_train_iter:
(VERY_HIGH   ) Runti

In [None]:
# Switch the runner's model to training mode before the manual training step.
runner.model.train()

In [None]:
# Training step: preprocess the first batch, compute its losses, then stop.
for data_batch in runner.train_dataloader:
    data_batch = runner.model.data_preprocessor(data_batch, training=True)
    # The preprocessor may return keyword-style (dict) or positional
    # (list/tuple) inputs; unpack accordingly.
    if isinstance(data_batch, dict):
        losses = runner.model(**data_batch, mode='loss')
    elif isinstance(data_batch, (list, tuple)):
        losses = runner.model(*data_batch, mode='loss')
    else:
        raise TypeError(
            'Output of `data_preprocessor` should be list, tuple or dict, '
            f'but got {type(data_batch)}')
    break  # one batch is enough for inspection

In [None]:
# Display the losses returned by the manual training step.
losses

In [3]:
# Switch the runner's model to evaluation mode before the manual validation step.
runner.model.eval()

CmtDetector(
  (data_preprocessor): Det3DDataPreprocessor()
  (pts_voxel_encoder): HardSimpleVFE()
  (pts_middle_encoder): SparseEncoder(
    (conv_input): SparseSequential(
      (0): SubMConv3d(5, 16, kernel_size=[3, 3, 3], stride=[1, 1, 1], padding=[1, 1, 1], dilation=[1, 1, 1], output_padding=[0, 0, 0], bias=False, algo=ConvAlgo.MaskImplicitGemm)
      (1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (encoder_layers): SparseSequential(
      (encoder_layer1): SparseSequential(
        (0): SparseBasicBlock(
          (conv1): SubMConv3d(16, 16, kernel_size=[3, 3, 3], stride=[1, 1, 1], padding=[1, 1, 1], dilation=[1, 1, 1], output_padding=[0, 0, 0], bias=False, algo=ConvAlgo.MaskImplicitGemm)
          (bn1): BatchNorm1d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (conv2): SubMConv3d(16, 16, kernel_size=[3, 3, 3], stride=[1, 1, 1], padding=[1, 1, 1], dilation=[1, 1, 1], ou

In [7]:
# Validation step: run prediction on the first validation batch, feed the
# result to the evaluator, then compute the metrics.
# NOTE(review): the *test* evaluator is used with the *validation* dataloader --
# confirm this is intended.
# Prediction runs under `no_grad` so no autograd graph is built during inference.
with torch.no_grad():
    for data_batch in runner.val_dataloader:
        data_batch = runner.model.data_preprocessor(data_batch, training=False)
        # The preprocessor may return keyword-style (dict) or positional
        # (list/tuple) inputs; unpack accordingly.
        if isinstance(data_batch, dict):
            outputs = runner.model(**data_batch, mode='predict')
        elif isinstance(data_batch, (list, tuple)):
            # Positional unpacking: `**` on a list/tuple would raise.
            outputs = runner.model(*data_batch, mode='predict')
        else:
            raise TypeError(
                'Output of `data_preprocessor` should be list, tuple or dict, '
                f'but got {type(data_batch)}')
        runner.test_evaluator.process(data_samples=outputs, data_batch=data_batch)
        break  # one batch is enough for inspection
    metrics = runner.test_evaluator.evaluate(len(runner.val_dataloader.dataset))

  self.post_center_range = torch.tensor(



Formating bboxes of pred_instances_3d
Start to convert detection format...
[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] 3/3, 22.6 task/s, elapsed: 0s, ETA:     0s
Results writes to /tmp/tmpmjgpah6a/results/pred_instances_3d/results_nusc.json
Evaluating bboxes of pred_instances_3d


AssertionError: Database version not found: data/nuscenes/v1.0-trainval

In [None]:
# Summarise the prediction structure: number of samples, top-level keys of the
# first sample, keys under 'pts_bbox', and the shape of its predicted 3D boxes.
len(outputs), outputs[0].keys(), outputs[0]['pts_bbox'].keys(), outputs[0]['pts_bbox']['bboxes_3d'].shape

In [None]:
# Shape of the predicted 3D labels for the first sample.
outputs[0]['pts_bbox']['labels_3d'].shape

In [None]:
# Display the complete prediction entry for the first sample.
outputs[0]