# VDPM Point Cloud Generation

**使用方法:**
1. 确保选择 GPU: Runtime → Change runtime type → GPU (T4)
2. 运行 Setup cell，等待自动重启
3. 重启后，从 Section 2 开始运行

In [None]:
#@title 1. Setup (自动重启)
import os

SETUP_FLAG = '/content/.vdpm_ready_v2'

if os.path.exists(SETUP_FLAG):
    print("✓ 已完成安装，请从 Section 2 继续运行")
else:
    print("=" * 50)
    print("Step 1: Clone VDPM")
    print("=" * 50)
    !rm -rf /content/vdpm
    !git clone --depth 1 https://github.com/eldar/vdpm.git /content/vdpm
    
    print("\n" + "=" * 50)
    print("Step 2: Fix NumPy")
    print("=" * 50)
    !pip uninstall -y numpy
    !pip install numpy==1.26.4
    
    print("\n" + "=" * 50)
    print("Step 3: Install VGGT")
    print("=" * 50)
    !pip install git+https://github.com/facebookresearch/vggt.git@44b3afb
    
    print("\n" + "=" * 50)
    print("Step 4: Install other deps")
    print("=" * 50)
    !pip install roma omegaconf einops jaxtyping
    
    # 验证安装
    print("\n" + "=" * 50)
    print("Verifying installation...")
    print("=" * 50)
    !python -c "import vggt; print('vggt OK')"
    !python -c "import omegaconf; print('omegaconf OK')"
    
    # 标记完成
    !touch {SETUP_FLAG}
    
    print("\n" + "=" * 50)
    print("✓ 安装完成！正在重启...")
    print("重启后请从 Section 2 继续")
    print("=" * 50)
    
    # 强制重启
    os._exit(0)

---
## 2. 加载模型 (重启后从这里开始)

In [None]:
import os
import sys

# Work from inside the cloned repository and put it first on the import path;
# this must happen before the imports below and in the following cells.
os.chdir('/content/vdpm')
sys.path.insert(0, '/content/vdpm')

import torch
import numpy as np

# Report the versions that are actually active after the restart.
print(f"PyTorch: {torch.__version__}")
print(f"NumPy: {np.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

# Fail early in this cell if the vggt package is not importable.
import vggt
print("vggt: OK")

# Device used by the cells below: the GPU when one is available, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
from omegaconf import OmegaConf
from dpm.model import VDPM

# Model configuration handed to the VDPM constructor.
cfg = OmegaConf.create({
    'model': {'name': 'dpm-video', 'pretrained': None, 'decoder_depth': 4}
})

# Build the model and move it to the selected device.
print("加载模型中...")
model = VDPM(cfg).to(device)

# Download the checkpoint (torch.hub reuses its cached copy when present).
# map_location='cpu' deserializes the tensors on the CPU, so loading also
# works on a runtime without CUDA; load_state_dict then copies the values
# into the model's parameters on whatever device the model lives on.
url = "https://huggingface.co/edgarsucar/vdpm/resolve/main/model.pt"
weights = torch.hub.load_state_dict_from_url(
    url,
    file_name="vdpm_model.pt",
    map_location="cpu",
)
# strict=True: fail loudly if the checkpoint keys do not match the model.
model.load_state_dict(weights, strict=True)
model.eval()

print("✓ 模型加载完成")

## 3. 定义函数

In [None]:
import cv2
import tempfile
from pathlib import Path
from vggt.utils.load_fn import load_and_preprocess_images

def extract_and_save_frames(video_path, output_dir, sample_hz=1.0):
    """Sample frames from a video and save them as PNG images.

    Every ``interval``-th decoded frame is written, where ``interval`` is
    ``int(fps / sample_hz)`` clamped to at least 1. Saved files are named
    ``0000.png``, ``0001.png``, ... in capture order.

    Args:
        video_path: Path of the video file to read.
        output_dir: Directory to write the frames into; created if missing.
        sample_hz: Target number of saved frames per second of video.
            Must be positive.

    Returns:
        List of the saved frame paths (as strings), in capture order.

    Raises:
        ValueError: If ``sample_hz`` is not positive.
        OSError: If the video cannot be opened or a frame cannot be written.
    """
    # Validate before touching the filesystem or the video backend.
    if sample_hz <= 0:
        raise ValueError(f"sample_hz must be positive, got {sample_hz!r}")

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    cap = cv2.VideoCapture(str(video_path))
    paths = []
    try:
        # Without this check an unreadable file would silently yield 0 frames.
        if not cap.isOpened():
            raise OSError(f"Cannot open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        # Fall back to 30 fps when the reported value is unusable
        # (0, negative, NaN or infinite).
        if not (0 < fps < float("inf")):
            fps = 30.0
        interval = max(int(fps / sample_hz), 1)

        count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if count % interval == 0:
                path = output_dir / f"{len(paths):04d}.png"
                # imwrite reports failure through its return value.
                if not cv2.imwrite(str(path), frame):
                    raise OSError(f"Failed to write frame: {path}")
                paths.append(str(path))
            count += 1
    finally:
        # Always release the capture handle, even when an error occurred.
        cap.release()

    print(f"提取了 {len(paths)} 帧")
    # Paths are already in capture order; re-sorting the strings would
    # misplace names beyond 9999.png.
    return paths


def run_vdpm(video_path, output_dir, ref_frame=0):
    """Run VDPM on a video and write the resulting point clouds to disk.

    Writes one ``frame_XXX.npz`` per entry along the first axis of the
    concatenated prediction, plus a ``sequence.npz`` stacking all of them.
    Each archive holds the arrays ``points`` and ``conf``.

    Args:
        video_path: Path of the input video.
        output_dir: Directory for the output files; created if missing.
        ref_frame: Index selected along the second axis of the predictions
            (per the notes below, the reference frame whose viewpoint the
            point cloud is expressed in).

    Returns:
        The output directory as a ``Path``.
    """
    out_root = Path(output_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    # Frames are only needed on disk while they are being loaded, so they
    # live in a temporary directory that is removed right afterwards.
    with tempfile.TemporaryDirectory() as scratch_dir:
        sampled_paths = extract_and_save_frames(video_path, scratch_dir)

        # Preprocess with the loader shipped with vggt.
        images = load_and_preprocess_images(sampled_paths)
        images = images.to(device)
        print(f"输入形状: {images.shape}")

    # Inference, with a leading batch axis added and gradients disabled.
    print("运行推理...")
    with torch.no_grad():
        result = model.inference(None, images=images.unsqueeze(0))

    # Collect the per-entry predictions as NumPy arrays (following how
    # gradio_demo.py handles them, per the original notes).
    # NOTE(review): pts3d is presumably [batch, num_target_frames, H, W, 3]
    # and conf [batch, num_target_frames, H, W] -- confirm against the model.
    pts_chunks = []
    conf_chunks = []
    for pointmap in result['pointmaps']:
        pts_chunks.append(pointmap['pts3d'].detach().cpu().numpy())
        conf_chunks.append(pointmap['conf'].detach().cpu().numpy())

    # Concatenate along the first axis; with a batch of one this presumably
    # gives [num_frames, num_target_frames, H, W, 3] -- TODO confirm.
    world_points = np.concatenate(pts_chunks, axis=0)
    world_conf = np.concatenate(conf_chunks, axis=0)

    print(f"world_points shape: {world_points.shape}")
    print(f"world_conf shape: {world_conf.shape}")

    # For every entry t, take the ref_frame slice, flatten it to a list of
    # 3-D points with matching confidences, and save it on its own.
    flat_points_per_frame = []
    flat_conf_per_frame = []
    for t in range(world_points.shape[0]):
        flat_points = world_points[t, ref_frame, :, :, :].reshape(-1, 3)
        flat_conf = world_conf[t, ref_frame, :, :].reshape(-1)

        frame_file = out_root / f"frame_{t:03d}.npz"
        np.savez(frame_file, points=flat_points, conf=flat_conf)

        flat_points_per_frame.append(flat_points)
        flat_conf_per_frame.append(flat_conf)
        print(f"  帧 {t}: {len(flat_points)} 个点")

    # Save the whole sequence as stacked arrays.
    sequence_file = out_root / "sequence.npz"
    np.savez(
        sequence_file,
        points=np.stack(flat_points_per_frame),
        conf=np.stack(flat_conf_per_frame),
    )

    print(f"\n✓ 保存到 {out_root}")
    return out_root

<cell_type>markdown</cell_type>## 4. 处理 VLM4D 测试视频 (10 个 GPT-5 mini 答错的题目)

点云将保存到 `/content/vdpm/vlm4d_pointclouds/{video_name}/sequence.npz`

In [None]:
import requests

# The 10 VLM4D videos as (name, URL) pairs. Every clip lives in the same
# dataset folder, so each URL is derived from the clip name.
VLM4D_VIDEOS = [
    (
        clip,
        f"https://huggingface.co/datasets/shijiezhou/VLM4D/resolve/main/videos_real/davis/{clip}.mp4",
    )
    for clip in (
        "baseball",
        "basketball-game",
        "bear",
        "bike-packing",
        "blackswan",
        "bmx-bumps",
        "bmx-rider",
        "boat",
        "breakdance",
        "breakdance-flare",
    )
]

# Download the videos, skipping any that are already on disk.
VIDEO_DIR = Path("/content/vdpm/vlm4d_videos")
VIDEO_DIR.mkdir(parents=True, exist_ok=True)

for name, url in VLM4D_VIDEOS:
    video_path = VIDEO_DIR / f"{name}.mp4"
    if video_path.exists():
        print(f"✓ 已存在: {name}.mp4")
        continue

    print(f"下载: {name}.mp4 ...", end=" ")
    # A timeout keeps a stalled connection from hanging the cell forever.
    resp = requests.get(url, timeout=60)
    # Stop on HTTP errors instead of saving the error page as an .mp4.
    resp.raise_for_status()
    # Write to a side file and rename it into place, so an interrupted
    # download never leaves a truncated .mp4 that later runs would treat
    # as already downloaded.
    partial_path = video_path.with_name(video_path.name + ".part")
    partial_path.write_bytes(resp.content)
    partial_path.replace(video_path)
    print("完成")

print(f"\n✓ 所有视频已下载到 {VIDEO_DIR}")

In [None]:
# 批量处理 VLM4D 视频
import time

# Root directory for the generated point clouds (one sub-directory per video).
POINTCLOUD_DIR = Path("/content/vdpm/vlm4d_pointclouds")
POINTCLOUD_DIR.mkdir(parents=True, exist_ok=True)

# Maps each video name to a human-readable status line for the summary.
results = {}

for name, url in VLM4D_VIDEOS:
    video_path = VIDEO_DIR / f"{name}.mp4"
    output_dir = POINTCLOUD_DIR / name

    print("\n" + "=" * 50)
    print(f"处理: {name}")
    print("=" * 50)

    # Skip videos whose final sequence file already exists.
    if (output_dir / "sequence.npz").exists():
        print("✓ 点云已存在，跳过")
        results[name] = "✓ 已存在"
        continue

    start = time.time()
    try:
        run_vdpm(str(video_path), str(output_dir))
    except Exception as e:
        # One failing video must not abort the batch; record it and move on.
        results[name] = f"✗ 失败: {e}"
    else:
        elapsed = time.time() - start
        results[name] = f"✓ 成功 ({elapsed:.1f}s)"

    # Release cached GPU memory before the next video.
    torch.cuda.empty_cache()

print("\n" + "=" * 50)
print("处理结果汇总:")
print("=" * 50)
for name, status in results.items():
    print(f"  {name}: {status}")

print(f"\n点云保存位置: {POINTCLOUD_DIR}")

<cell_type>markdown</cell_type>## 5. 检查 VLM4D 点云结果

In [None]:
# Inspect the generated VLM4D point clouds: print the array shapes of each
# sequence file, or flag the video as missing.
print("生成的点云:")
for name, _ in VLM4D_VIDEOS:
    npz_path = POINTCLOUD_DIR / name / "sequence.npz"
    if npz_path.exists():
        # np.load keeps the .npz archive open until it is closed; the
        # context manager releases the file handle after each video.
        with np.load(npz_path) as data:
            print(f"  ✓ {name}: points={data['points'].shape}, conf={data['conf'].shape}")
    else:
        print(f"  ✗ {name}: 缺失")

<cell_type>markdown</cell_type>## 6. 下载 VLM4D 点云

下载后解压到 `test_vdpm_gpt5mini_vlm4d.ipynb` 所在目录的 `vlm4d_pointclouds/` 文件夹

In [None]:
# Zip the point-cloud directory. Running zip from /content/vdpm keeps the
# paths inside the archive relative, starting at vlm4d_pointclouds/.
!cd /content/vdpm && zip -r /content/vlm4d_pointclouds.zip vlm4d_pointclouds/
from google.colab import files
# Hand the archive to Colab's download helper (presumably starts a browser
# download of the file -- confirm against the Colab documentation).
files.download("/content/vlm4d_pointclouds.zip")