In [1]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+) 
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16

# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Read images from 'people/images' directory
# image_dir = 'examples/room/images'
image_dir = 'people/images'
image_names = [os.path.join(image_dir, img) for img in os.listdir(image_dir) if img.endswith(('.png', '.jpg', '.jpeg'))]
print(image_names)

['people/images/WechatIMG199.jpg', 'people/images/WechatIMG200.jpg', 'people/images/WechatIMG201.jpg', 'people/images/WechatIMG202.jpg', 'people/images/WechatIMG203.jpg']


In [3]:
# Load and preprocess example images (replace with your own image paths)
images = load_and_preprocess_images(image_names, mode='pad').to(device)

with torch.no_grad():
    with torch.cuda.amp.autocast(dtype=dtype):
        # Predict attributes including cameras, depth maps, and point maps.
        predictions = model(images)

In [4]:
predictions

{'pose_enc': tensor([[[-8.2492e-05,  5.6599e-05,  8.8434e-05,  2.2337e-05, -1.1371e-05,
           -2.7157e-05,  1.0003e+00,  1.1167e+00,  1.1153e+00],
          [-1.4083e-01, -3.9391e-01,  1.5012e+00,  1.7941e-02,  9.7272e-01,
            2.1457e-01,  7.4722e-02,  1.1206e+00,  1.1226e+00],
          [ 7.5284e-01, -3.5450e-03,  2.7396e-01, -4.5716e-02, -7.2107e-01,
           -1.2707e-01,  6.7989e-01,  1.0209e+00,  1.0188e+00],
          [ 3.0072e-01,  1.5258e-01, -5.2038e-01,  4.6226e-02, -1.7206e-01,
           -4.5178e-02,  9.8273e-01,  1.0120e+00,  1.0110e+00],
          [-5.8533e-01,  1.9084e-01, -3.9816e-01,  2.2616e-01,  3.1385e-01,
            6.8584e-02,  9.1971e-01,  1.1021e+00,  1.0964e+00]]],
        device='cuda:0'),
 'depth': tensor([[[[[1.0975],
            [1.1496],
            [1.1598],
            ...,
            [1.0884],
            [1.0761],
            [1.0391]],
 
           [[1.1230],
            [1.1316],
            [1.1436],
            ...,
            [1.0

In [5]:
from PIL import Image

# Get the sizes of the images
image_sizes = []
for image_path in image_names:
    with Image.open(image_path) as img:
        image_sizes.append(img.size)  # (width, height)

print(image_sizes)

[(1278, 2572), (1280, 1707), (1280, 1707), (1280, 1707), (1280, 1707)]


In [6]:
from vggt.utils.pose_enc import pose_encoding_to_extri_intri

num = 0
# pose_enc: tensor of shape (B, S, 9)
extrinsics, intrinsics = pose_encoding_to_extri_intri(
    predictions['pose_enc'][..., num:num+1, :], image_size_hw=image_sizes[num], build_intrinsics=True
)
# extrinsics: (B, S, 3, 4)
# intrinsics: (B, S, 3, 3)

In [7]:
predictions['pose_enc'][..., num:num+1, :].shape

torch.Size([1, 1, 9])

In [8]:
extrinsics

tensor([[[[ 1.0000e+00,  5.4299e-05, -2.2738e-05, -8.2492e-05],
          [-5.4300e-05,  1.0000e+00, -4.4661e-05,  5.6599e-05],
          [ 2.2735e-05,  4.4662e-05,  1.0000e+00,  8.8434e-05]]]],
       device='cuda:0')

In [9]:
intrinsics

tensor([[[[2.0619e+03, 0.0000e+00, 1.2860e+03],
          [0.0000e+00, 1.0230e+03, 6.3900e+02],
          [0.0000e+00, 0.0000e+00, 1.0000e+00]]]], device='cuda:0')

In [10]:
# Create a dictionary mapping each image to its corresponding extrinsics and intrinsics
camera_params = {}
for idx, image_path in enumerate(image_names):
    extrinsics, intrinsics = pose_encoding_to_extri_intri(
        predictions['pose_enc'][..., idx:idx+1, :], image_size_hw=image_sizes[idx], build_intrinsics=True
    )
    camera_params[image_path] = {
        'extrinsics': extrinsics.cpu().numpy(),
        'intrinsics': intrinsics.cpu().numpy()
    }

# Print the camera parameters for each image
for image_path, params in camera_params.items():
    print(f"Image: {image_path}")
    print(f"Extrinsics:\n{params['extrinsics']}")
    print(f"Intrinsics:\n{params['intrinsics']}\n")

Image: people/images/WechatIMG199.jpg
Extrinsics:
[[[[ 1.0000000e+00  5.4299275e-05 -2.2737599e-05 -8.2491752e-05]
   [-5.4300293e-05  1.0000000e+00 -4.4660996e-05  5.6598939e-05]
   [ 2.2735172e-05  4.4662233e-05  1.0000000e+00  8.8434026e-05]]]]
Intrinsics:
[[[[2.0619067e+03 0.0000000e+00 1.2860000e+03]
   [0.0000000e+00 1.0229753e+03 6.3900000e+02]
   [0.0000000e+00 0.0000000e+00 1.0000000e+00]]]]

Image: people/images/WechatIMG200.jpg
Extrinsics:
[[[[-0.9881675   0.00284141  0.15335205 -0.1408295 ]
   [ 0.06709376  0.9071014   0.41553032 -0.39391387]
   [-0.13792518  0.42090255 -0.89655876  1.501157  ]]]]
Intrinsics:
[[[[1.35744495e+03 0.00000000e+00 8.53500000e+02]
   [0.00000000e+00 1.02008264e+03 6.40000000e+02]
   [0.00000000e+00 0.00000000e+00 1.00000000e+00]]]]

Image: people/images/WechatIMG201.jpg
Extrinsics:
[[[[-0.07172966  0.23861735 -0.968461    0.752839  ]
   [-0.10681566  0.9635405   0.24531636 -0.00354502]
   [ 0.9916881   0.12104323 -0.04362631  0.273964  ]]]]
Intri

In [11]:
import numpy as np

def extrinsic_3x4_to_nerf_transform(extrinsic_3x4: np.ndarray) -> np.ndarray:
    """
    将 3x4 外参矩阵（world→camera）转换为 NeRF 使用的 4x4 变换矩阵（camera→world）。
    
    参数
    ----
    extrinsic_3x4 : np.ndarray
        形状为 (3,4) 的外参矩阵 [R | t]，将世界坐标映射到相机坐标。

    返回
    ----
    transform_4x4 : np.ndarray
        形状为 (4,4) 的变换矩阵，将相机坐标映射到世界坐标，可直接填入 NeRF 的 transforms.json 中。
    """
    assert extrinsic_3x4.shape == (3, 4), "输入必须是 3×4 矩阵"
    
    # 拆分旋转和平移
    R = extrinsic_3x4[:, :3]   # 3×3
    t = extrinsic_3x4[:, 3:]   # 3×1

    # world→camera: X_cam = R·X_world + t
    # 我们需要 camera→world: X_world = Rᵀ·(X_cam − t) = Rᵀ·X_cam + (−Rᵀ·t)
    R_c2w = R.T
    t_c2w = -R.T @ t

    # 构造 4×4
    transform_4x4 = np.eye(4, dtype=extrinsic_3x4.dtype)
    transform_4x4[:3, :3] = R_c2w
    transform_4x4[:3, 3  ] = t_c2w.flatten()

    F = np.diag([1, -1, -1, 1])
    transform_4x4 = F @ transform_4x4 @ F # F @ T @ F-1

    return transform_4x4

In [12]:
import os

# Create output directory if it doesn't exist
output_dir = "nerfstudio_format_output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [13]:

import json
import numpy as np

frames = []
count = 0
for image_path, params in camera_params.items():
    extrinsics = params['extrinsics'].squeeze(0).squeeze(0)  # (3, 4)
    intrinsics = params['intrinsics'].squeeze(0).squeeze(0)

    # 提取内参参数
    fl_x = intrinsics[0, 0]
    fl_y = intrinsics[1, 1]
    cx = intrinsics[1, 2]
    cy = intrinsics[0, 2]
    w = image_sizes[count][0]  # 如果已知图像宽度
    h = image_sizes[count][1]  # 如果已知图像高度

    # 构建 4x4 的 transform_matrix
    transform_matrix = extrinsic_3x4_to_nerf_transform(extrinsics)

    frame = {
        'file_path': image_path,
        'fl_x': fl_x,
        'fl_y': fl_y,
        'cx': cx,
        'cy': cy,
        'w': w,
        'h': h,
        'transform_matrix': transform_matrix.tolist()
    }
    frames.append(frame)

    count += 1

# 构建最终的 transforms.json 结构
transforms = {
    'camera_model': 'OPENCV',  # 或 'OPENCV_FISHEYE'，根据您的相机模型选择
    'frames': frames
}

# 将结果写入 transforms.json 文件
with open(f'{output_dir}/transforms.json', 'w') as f:
    # Convert numpy.float32 to Python float for JSON serialization
    json.dump(transforms, f, indent=4, default=lambda o: float(o) if isinstance(o, np.float32) else o)

In [14]:
import numpy as np
from plyfile import PlyData, PlyElement

# Get point clouds and their confidence values from predictions
point_clouds = predictions['world_points'][0].cpu().numpy()  # Remove batch dimension
confidence = predictions['world_points_conf'][0].cpu().numpy()  # Remove batch dimension
# normalize confidence to [0, 1]
confidence = (confidence - confidence.min()) / (confidence.max() - confidence.min())
images = predictions['images'][0].cpu().numpy()  # Get images for color information

# Reshape the arrays
points = point_clouds.reshape(-1, 3)  # Reshape to (N, 3)
conf = confidence.reshape(-1)  # Reshape to (N,)
colors = images.transpose(0, 3, 1, 2).reshape(-1, 3)  # Reshape to (N, 3)

# Filter points with confidence > 0.7
mask = conf > 0.3
filtered_points = points[mask]
filtered_colors = colors[mask]

# transform the points coordinates to nerf system
F = np.diag([1, -1, -1])
filtered_points = F @ filtered_points.T
filtered_points = filtered_points.T

# Convert colors from [0,1] to [0,255] range and ensure they're uint8
filtered_colors = (filtered_colors * 255).astype(np.uint8)

# Create vertex array with dtype for PLY format (including color)
vertex = np.array([(p[0], p[1], p[2], c[0], c[1], c[2]) 
                  for p, c in zip(filtered_points, filtered_colors)],
                  dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
                        ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')])

# Create PlyElement
vertex_element = PlyElement.describe(vertex, 'vertex')

# Create and save PLY file
PlyData([vertex_element]).write(f'{output_dir}/point_cloud_with_color.ply')

print(f"Saved {len(filtered_points)} points with confidence > 0.7 to point_cloud_with_color.ply")

Saved 167033 points with confidence > 0.7 to point_cloud_with_color.ply
