# ESAM 在线 DINOv2 数据增强与 2D–3D 对齐快速检查

本 notebook 用于在 ESAM 环境下，对新引入的在线 DINOv2 + 数据增强管线做最小闭环验证：

1. 从 `ESAM_sv_scannet200_CA_dino.py` 加载配置；
2. 构建 ScanNet200-SV 训练数据集与 DataLoader，检查 `img` / `cam_info` / `points` 等字段；
3. 构建 `ScanNet200MixFormer3D` 模型（包含 `DINOv2Backbone`），跑一次前向提取特征；
4. 观察 DINO FPN 构建的日志与输出形状，确认 2D–3D 对齐链路工作正常。


In [1]:
# 0. 环境与基础导入（在 ESAM conda/venv 环境下运行）
import os
import sys

import torch

from mmengine.config import Config
from mmengine.dataset import DefaultSampler, default_collate
from torch.utils.data import DataLoader

from mmdet3d.registry import MODELS, DATASETS
from mmengine.runner import load_checkpoint

# Pin both the working directory and the import search path to the local
# 3D_Reconstruction checkout, so that this repository's oneformer3d is
# preferred over any copy that may be installed in site-packages.
repo_root = '/home/nebula/xxy/3D_Reconstruction'
os.chdir(repo_root)
# Rebuild sys.path with repo_root as the single leading entry
# (any pre-existing occurrences of it are dropped first).
sys.path = [repo_root, *(entry for entry in sys.path if entry != repo_root)]
print('CWD =', os.getcwd())
print('sys.path[0] =', sys.path[0])



CWD = /home/nebula/xxy/3D_Reconstruction
sys.path[0] = /home/nebula/xxy/3D_Reconstruction


In [2]:
# 1. Load the ESAM DINO config (switch to the fine-tuned version under
#    work_dirs here if needed).
cfg_path = os.path.join('configs', 'ESAM_CA', 'ESAM_sv_scannet200_CA_dino.py')
cfg = Config.fromfile(cfg_path)
print('Loaded config from', os.path.abspath(cfg_path))

# Quick look at every transform of the training pipeline.
print('\n[train_pipeline]')
for t in cfg.train_dataloader.dataset.pipeline:
    print(' ', t.get('type', 'Unknown'), t)


# Pipeline sanity checks: collect the transform type names once, then report
# whether the image-side transforms are present.
pipeline_types = [t.get('type', 'Unknown') for t in cfg.train_dataloader.dataset.pipeline if isinstance(t, dict)]
# The leading newline must be spelled as the escape sequence '\n'; a raw line
# break inside a single-quoted literal is a SyntaxError.
print('\n[pipeline sanity] contains BGR2RGBImg:', 'BGR2RGBImg' in pipeline_types)
if 'BGR2RGBImg' in pipeline_types:
    print('  BGR2RGBImg index:', pipeline_types.index('BGR2RGBImg'))
print('[pipeline sanity] contains ResizeForDINO:', 'ResizeForDINO' in pipeline_types)
print('[pipeline sanity] contains ColorJitterImg:', 'ColorJitterImg' in pipeline_types)




Loaded config from /home/nebula/xxy/3D_Reconstruction/configs/ESAM_CA/ESAM_sv_scannet200_CA_dino.py

[train_pipeline]
  LoadPointsFromFile {'type': 'LoadPointsFromFile', 'coord_type': 'DEPTH', 'shift_height': False, 'use_color': True, 'load_dim': 6, 'use_dim': [0, 1, 2, 3, 4, 5]}
  LoadAnnotations3D_ {'type': 'LoadAnnotations3D_', 'with_bbox_3d': False, 'with_label_3d': False, 'with_mask_3d': True, 'with_seg_3d': True, 'with_sp_mask_3d': True}
  SwapChairAndFloor {'type': 'SwapChairAndFloor'}
  PointSegClassMapping {'type': 'PointSegClassMapping'}
  LoadSingleImageFromFile {'type': 'LoadSingleImageFromFile', 'dataset_type': 'scannet200'}
  RandomFlip3D {'type': 'RandomFlip3D', 'sync_2d': True, 'flip_ratio_bev_horizontal': 0.5, 'flip_ratio_bev_vertical': 0.0}
  GlobalRotScaleTrans {'type': 'GlobalRotScaleTrans', 'rot_range': [-3.14, 3.14], 'scale_ratio_range': [0.8, 1.2], 'translation_std': [0.1, 0.1, 0.1], 'shift_height': False}
  NormalizePointsColor_ {'type': 'NormalizePointsColor_',

  def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41):


In [3]:
# 2. Build the training dataset and a DataLoader, then fetch a single batch
#    for shape checks (batch > 1 is supported).
train_dataset = DATASETS.build(cfg.train_dataloader.dataset)
print('Train dataset length =', len(train_dataset))

# Increase BATCH_SIZE freely to exercise batch alignment.
BATCH_SIZE = 2
sampler = DefaultSampler(train_dataset, shuffle=False)
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    num_workers=0,
    collate_fn=default_collate,
)

batch = next(iter(train_loader))
print('\n[Raw batch keys]:', batch.keys())
inputs, data_samples = batch['inputs'], batch['data_samples']
print('[inputs keys]:', inputs.keys())
print('batch_size =', BATCH_SIZE)

# points: either a list[Tensor] or a single Tensor, depending on the collate.
pts_raw = inputs.get('points', None)
if not isinstance(pts_raw, list):
    print('points type:', type(pts_raw), 'shape:', getattr(pts_raw, 'shape', None))
else:
    print('points list len:', len(pts_raw), 'first shape:', pts_raw[0].shape)

# img: usually stacked into (B,3,420,560).
img_raw = inputs.get('img', None)
if img_raw is not None:
    print('img type:', type(img_raw), 'shape:', getattr(img_raw, 'shape', None))
else:
    print('img not found in inputs')

# cam_info: the raw structure may be a list[dict] or may have been batched
# into a single dict.
cam_raw = inputs.get('cam_info', None)
print('\n[cam_info raw]')
print('cam_info type:', type(cam_raw))
if isinstance(cam_raw, list):
    print('cam_info len:', len(cam_raw))
print(cam_raw)

# Preview the augmentation flow recorded on the raw data_samples. The
# metadata is read from `img_metas` when that is a dict, otherwise from
# `metainfo`, otherwise an empty dict is used.
print('\n[raw data_samples meta summary]')
for idx, sample in enumerate(data_samples):
    meta = getattr(sample, 'img_metas', None)
    if not isinstance(meta, dict):
        meta = getattr(sample, 'metainfo', {})
    flow = meta.get('transformation_3d_flow', None)
    print(f' sample {idx}: has_flow={flow is not None}, flow={flow}')


Train dataset length = 10001
[LoadCamInfo] 使用固定标准内参: [577.870605, 577.870605, 319.5, 239.5] (ScanNet官方策略)
[Pack3DDetInputs_] keys=['points', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask', 'sp_pts_mask', 'gt_sp_masks', 'elastic_coords', 'img', 'cam_info'], inputs_keys=['points', 'elastic_coords', 'img', 'cam_info']
[Pack3DDetInputs_] keys=['points', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask', 'sp_pts_mask', 'gt_sp_masks', 'elastic_coords', 'img', 'cam_info'], inputs_keys=['points', 'elastic_coords', 'img', 'cam_info']

[Raw batch keys]: dict_keys(['data_samples', 'inputs'])
[inputs keys]: dict_keys(['points', 'elastic_coords', 'img', 'cam_info'])
batch_size = 2
points type: <class 'torch.Tensor'> shape: torch.Size([2, 20000, 6])
img type: <class 'torch.Tensor'> shape: torch.Size([2, 3, 420, 560])

[cam_info raw]
cam_info type: <class 'list'>
cam_info len: 1
[{'intrinsics': [tensor([505.6368, 505.6368], dtype=torch.float64), tensor([505.6368, 505.6368], dtype=tor



In [4]:
# 3. Build the model from cfg.model and load pretrained weights when
#    cfg.load_from is set.
import inspect
import oneformer3d.mixformer3d as mixformer3d_mod

model = MODELS.build(cfg.model)
print('Model class:', type(model))
print('Backbone:', type(model.backbone))
if not hasattr(model, 'dino'):
    print('No DINO module found on model')
else:
    print('DINO module:', type(model.dino))

# Show which file the mixformer3d module was imported from, plus the signature
# of the online DINO FPN builder exposed by the model.
print('oneformer3d.mixformer3d file =', mixformer3d_mod.__file__)
print('signature _build_dino_fpn_online:', inspect.signature(model._build_dino_fpn_online))

if cfg.get('load_from', None) is None:
    print('cfg.load_from is None, using randomly initialized weights')
else:
    ckpt_path = cfg.load_from
    print('Loading checkpoint from', ckpt_path)
    _ = load_checkpoint(model, ckpt_path, map_location='cpu')

# Move the model to the GPU when one is available, then switch to eval mode.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
model.eval()
print('Using device:', device)


2025-12-12 16:34:15,462 - INFO - 从本地 DINOv2 repo 加载: /home/nebula/.cache/torch/hub/facebookresearch_dinov2_main


2025-12-12 16:34:15,467 - INFO - using MLP layer as FFN
2025-12-12 16:34:17,746 - INFO - ✓ 从本地权重加载: /home/nebula/xxy/dataset/models/dinov2_vitl14_reg4_pretrain.pth
2025-12-12 16:34:17,851 - INFO - ✓ 成功加载 dinov2_vitl14_reg


Model class: <class 'oneformer3d.mixformer3d.ScanNet200MixFormer3D'>
Backbone: <class 'oneformer3d.mink_unet.Res16UNet34C'>
DINO module: <class 'oneformer3d.dino_backbone.DINOv2Backbone'>
Loading checkpoint from /home/nebula/xxy/3D_Reconstruction/work_dirs/ESAM_sv_scannet200_CA/best_all_ap_50%_epoch_128.pth
Loads checkpoint by local backend from path: /home/nebula/xxy/3D_Reconstruction/work_dirs/ESAM_sv_scannet200_CA/best_all_ap_50%_epoch_128.pth
The model and loaded state dict do not match exactly

missing keys in source state_dict: backbone.dino_proj_8.kernel, backbone.dino_proj_4.kernel, backbone.dino_proj_2.kernel, backbone.dino_proj_1.kernel, dino.model.cls_token, dino.model.pos_embed, dino.model.register_tokens, dino.model.mask_token, dino.model.patch_embed.proj.weight, dino.model.patch_embed.proj.bias, dino.model.blocks.0.norm1.weight, dino.model.blocks.0.norm1.bias, dino.model.blocks.0.attn.qkv.weight, dino.model.blocks.0.attn.qkv.bias, dino.model.blocks.0.attn.proj.weight, din

In [5]:
# 4. Run the batch through the model's data_preprocessor, then call
#    extract_feat and inspect the DINO FPN results (batch > 1 is supported).
with torch.no_grad():
    # data_preprocessor expects a dict that already holds 'inputs' and
    # 'data_samples'.
    data_batch = {'inputs': batch['inputs'], 'data_samples': batch['data_samples']}
    processed = model.data_preprocessor(data_batch, training=False)
    batch_inputs, batch_samples = processed['inputs'], processed['data_samples']

    print('[After data_preprocessor] inputs keys:', batch_inputs.keys())
    if 'img' in batch_inputs:
        print('  img tensor shape:', batch_inputs['img'].shape, 'device:', batch_inputs['img'].device)
    if 'cam_info' in batch_inputs:
        cam_raw = batch_inputs['cam_info']
        print('  cam_info type:', type(cam_raw), 'len:', len(cam_raw) if isinstance(cam_raw, list) else None)

    # ---- Move everything onto the model's device to avoid CPU/GPU mixing ----
    device = next(model.parameters()).device

    def _to_dev(x):
        """Return x moved to `device` when it is a tensor, otherwise x unchanged."""
        if torch.is_tensor(x):
            return x.to(device)
        return x

    # points: every list element is moved; a bare tensor is moved as a whole.
    pts = batch_inputs.get('points', None)
    if torch.is_tensor(pts):
        batch_inputs['points'] = pts.to(device)
    elif isinstance(pts, list):
        batch_inputs['points'] = [p.to(device) for p in pts]

    if 'img' in batch_inputs and torch.is_tensor(batch_inputs['img']):
        batch_inputs['img'] = batch_inputs['img'].to(device)

    # elastic_coords: tensors inside a list are moved as they are; anything
    # else is converted to a float32 tensor on the target device.
    if 'elastic_coords' in batch_inputs:
        ec = batch_inputs['elastic_coords']
        if not isinstance(ec, list):
            batch_inputs['elastic_coords'] = torch.as_tensor(ec, device=device, dtype=torch.float32)
        else:
            batch_inputs['elastic_coords'] = [
                e.to(device) if torch.is_tensor(e)
                else torch.as_tensor(e, device=device, dtype=torch.float32)
                for e in ec
            ]

    # Optional precomputed feature entries: moved only when they are present.
    for k in ('clip_pix', 'clip_global', 'dino_point_feats'):
        if k not in batch_inputs:
            continue
        v = batch_inputs[k]
        if torch.is_tensor(v):
            batch_inputs[k] = v.to(device)
        elif isinstance(v, list):
            batch_inputs[k] = [_to_dev(t) for t in v]

    # ---- Report which 3D augmentations (HF/R/S/T/VF) were recorded per sample ----
    print('\n[3D aug flow per sample]')
    for i, sample in enumerate(batch_samples):
        meta = getattr(sample, 'img_metas', None)
        if not isinstance(meta, dict):
            meta = getattr(sample, 'metainfo', {})
        flow = meta.get('transformation_3d_flow', []) or []
        hits = {tag: (tag in flow) for tag in ['HF', 'R', 'S', 'T', 'VF']}
        print(
            f' sample {i}: flow={flow}, hits={hits}, '
            f'hflip={meta.get("pcd_horizontal_flip")}, '
            f'rot_angle={meta.get("pcd_rotation_angle")}, '
            f'scale={meta.get("pcd_scale_factor")}, '
            f'trans={meta.get("pcd_trans")}'
        )
    print('using device for forward:', device)

    # extract_feat triggers the online DINO + FPN construction internally.
    features, point_features, all_xyz_w = model.extract_feat(batch_inputs, batch_samples)
    print('\n[extract_feat outputs]')
    print('  #superpoint feature tensors:', len(features))
    print('  first sp_feat shape:', features[0].shape if len(features) > 0 else None)
    print('  #point feature tensors:', len(point_features))
    print('  first point_feat shape:', point_features[0].shape if len(point_features) > 0 else None)
    print('  all_xyz_w shape:', all_xyz_w.shape if hasattr(all_xyz_w, 'shape') else type(all_xyz_w))

    # Extra output: DINO feature-map size and the 2D-3D valid rate, read from
    # debug attributes on the model when they exist.
    if getattr(model, 'dino', None) is not None:
        last_shape = getattr(model.dino, '_last_feat_shape', None)
        if last_shape is not None:
            print('\n[DINO debug] last DINO feat shape (B,C,H_p,W_p):', last_shape)

    valid_rate = getattr(model, '_last_dino_valid_rate', None)
    per_sample = getattr(model, '_last_dino_valid_rate_per_sample', None)
    if valid_rate is None:
        print('[DINO debug] valid_rate not available')
    else:
        print('[DINO debug] valid_rate (all points): {:.4f}'.format(valid_rate))
        if isinstance(per_sample, (list, tuple)):
            print('  per-sample valid rates:', ['{:.4f}'.format(r) for r in per_sample])

    coords_src = getattr(model, '_last_dino_coords_source', None)
    if coords_src is None:
        print('[DINO debug] dino coords source not available')
    else:
        print('[DINO debug] dino coords source (per-sample):', coords_src)


print('\n如果上述 cell 没有报错，并能看到 [DINO] 日志、DINO 特征图尺寸与合理的 valid rate，则在线 DINO + 2D–3D 对齐路径基本工作正常。')



[After data_preprocessor] inputs keys: dict_keys(['points', 'elastic_coords', 'img', 'cam_info'])
  img tensor shape: torch.Size([2, 3, 420, 560]) device: cuda:0
  cam_info type: <class 'list'> len: 1

[3D aug flow per sample]
 sample 0: flow=['R', 'S', 'T'], hits={'HF': False, 'R': True, 'S': True, 'T': True, 'VF': False}, hflip=False, rot_angle=0.30148888902291926, scale=1.1061340855918897, trans=[    0.13457     0.11183   -0.017188]
 sample 1: flow=['HF', 'R', 'S', 'T'], hits={'HF': True, 'R': True, 'S': True, 'T': True, 'VF': False}, hflip=True, rot_angle=1.676274561710661, scale=1.002212835630251, trans=[   0.020099  0.00077867   -0.013422]
using device for forward: cuda:0
[DINO][debug] keys={'dino_fpn': False, 'dino_feats': False, 'dino_point_feats': False, 'clip_pix': False, 'img': True, 'cam_info': True}, cam_info_type=<class 'list'>, cam_len=1
[DINO] build from online DINO backbone, shapes=[torch.Size([23351, 1024]), torch.Size([9744, 1024]), torch.Size([2919, 1024]), torch.Si