In [10]:
import torch
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+).
# The capability is only queried when CUDA is present: on a CPU-only machine
# torch.cuda.get_device_capability() raises, which would defeat the fallback.
use_bfloat16 = device == "cuda" and torch.cuda.get_device_capability()[0] >= 8
dtype = torch.bfloat16 if use_bfloat16 else torch.float16

model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)

# Ten consecutive frames, frame_0001.png through frame_0010.png.
frames_dir = "/mnt/persistent-data/calib_challenge/vggt/1/frames"
image_names = [f"{frames_dir}/frame_{idx:04d}.png" for idx in range(1, 11)]

images = load_and_preprocess_images(image_names).to(device)

with torch.no_grad():
    # Mixed precision is only enabled when running on CUDA.
    with torch.autocast(device_type="cuda", dtype=dtype, enabled=device == "cuda"):
        # Predict attributes including cameras, depth maps, and point maps.
        predictions = model(images)

In [11]:
# Pull the pose encoding and the world-point output out of the predictions.
pose_enc_data, world_points = (
    predictions[key] for key in ("pose_enc", "world_points")
)

In [12]:
import numpy as np
from typing import Any
import torch
from numpy.typing import NDArray
from vggt.utils.pose_enc import pose_encoding_to_extri_intri

def extract_pitch_yaw_radians(
    pose_enc_data: torch.Tensor,
    image_size_hw: tuple[int, int]
) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
    """
    Extract per-frame pitch/yaw in RADIANS using VGGT's native utilities.

    Only the first batch element is processed.

    Args:
        pose_enc_data: VGGT pose encoding tensor
        image_size_hw: Tuple of (height, width) in pixels

    Returns:
        Tuple of (pitch_angles, yaw_angles) as float64 numpy arrays in
        radians, one entry per frame
    """
    # Use VGGT's native conversion; the intrinsics are not needed here.
    extrinsics, _ = pose_encoding_to_extri_intri(
        pose_enc_data,
        image_size_hw=image_size_hw,
        pose_encoding_type="absT_quaR_FoV"
    )

    batch_idx: int = 0

    # Rotation matrix (3x3 part of the extrinsic) of every frame, copied to
    # the host in one go instead of once per frame. Casting to float64 on the
    # torch side matches the annotated return dtype and also covers bfloat16
    # tensors, which cannot be converted to numpy directly.
    rotations: NDArray[np.float64] = (
        extrinsics[batch_idx, :, :3, :3].detach().double().cpu().numpy()
    )

    # Euler angles from the rotation matrices (IN RADIANS). Rounding can push
    # the sine marginally outside [-1, 1], where arcsin would return NaN, so
    # it is clipped first.
    pitch_angles: NDArray[np.float64] = np.arcsin(
        np.clip(-rotations[:, 2, 0], -1.0, 1.0)
    )
    yaw_angles: NDArray[np.float64] = np.arctan2(
        rotations[:, 1, 0], rotations[:, 0, 0]
    )

    return pitch_angles, yaw_angles

def calculate_camera_mounting_from_vp(
    vp_coords: list[float], 
    camera_intrinsics: NDArray[np.float32]
) -> tuple[float, float]:
    """
    Derive the camera mounting angles (RADIANS) from a vanishing point.

    Args:
        vp_coords: vanishing point as [x, y] pixel coordinates
        camera_intrinsics: 3x3 camera intrinsic matrix

    Returns:
        (pitch_offset, yaw_offset), both in RADIANS
    """
    vp_x, vp_y = vp_coords[0], vp_coords[1]

    # Back-project the homogeneous pixel through the inverse intrinsics.
    ray = np.linalg.inv(camera_intrinsics) @ np.array([vp_x, vp_y, 1.0])
    ray_x, ray_y = ray[0], ray[1]

    # Yaw comes from the horizontal component alone; pitch uses the vertical
    # component scaled by cos(yaw), with its sign flipped.
    yaw_offset: float = np.arctan(ray_x)
    pitch_offset: float = -np.arctan(ray_y * np.cos(yaw_offset))

    return pitch_offset, yaw_offset

def find_vanishing_point_from_gt(
    gt_pitch_rad: NDArray[np.float64] | float, 
    gt_yaw_rad: NDArray[np.float64] | float, 
    vggt_pitch_rad: NDArray[np.float64], 
    vggt_yaw_rad: NDArray[np.float64], 
    camera_intrinsics: NDArray[np.float32]
) -> list[float]:
    """
    Reverse-engineer vanishing point from ground truth data
    
    Args:
        gt_pitch_rad: Ground truth pitch in radians (array or single value)
        gt_yaw_rad: Ground truth yaw in radians (array or single value)  
        vggt_pitch_rad: VGGT relative pitch in radians
        vggt_yaw_rad: VGGT relative yaw in radians
        camera_intrinsics: 3x3 camera intrinsic matrix
    
    Returns:
        List of [x, y] vanishing point coordinates in pixels
    """
    # Calculate required offset (use first frame or average)
    if isinstance(gt_pitch_rad, np.ndarray):
        target_pitch_offset: float = gt_pitch_rad[0] - vggt_pitch_rad[0]
        target_yaw_offset: float = gt_yaw_rad[0] - vggt_yaw_rad[0]
    else:
        target_pitch_offset = gt_pitch_rad - vggt_pitch_rad[0]
        target_yaw_offset = gt_yaw_rad - vggt_yaw_rad[0]
    
    # Reverse the vanishing point calculation
    # From: yaw_offset = arctan(vp_norm[0])
    # To: vp_norm[0] = tan(yaw_offset)
    vp_norm_x: float = np.tan(target_yaw_offset)
    
    # From: pitch_offset = -arctan(vp_norm[1] * cos(yaw_offset))
    # To: vp_norm[1] = -tan(pitch_offset) / cos(yaw_offset)
    vp_norm_y: float = -np.tan(target_pitch_offset) / np.cos(target_yaw_offset)
    
    # Convert normalized coordinates back to pixel coordinates
    vp_normalized: NDArray[np.float64] = np.array([vp_norm_x, vp_norm_y, 1.0])
    vp_pixel: NDArray[np.float64] = camera_intrinsics @ vp_normalized
    
    return [float(vp_pixel[0]), float(vp_pixel[1])]

def complete_camera_calibration(
    pose_enc_data: torch.Tensor,
    image_size_hw: tuple[int, int],
    vp_coords: list[float] | None = None,
    gt_data: dict[str, NDArray[np.float64]] | None = None,
    focal_length: float = 910.0
) -> tuple[NDArray[np.float64], NDArray[np.float64], dict[str, Any]]:
    """
    Complete camera calibration workflow combining VGGT + vanishing point.

    The per-frame VGGT pitch/yaw is shifted by a constant mounting offset that
    is derived from the vanishing point. When no vanishing point is given it
    is estimated from the ground truth first. Progress is printed along the way.

    Args:
        pose_enc_data: VGGT pose encoding output tensor
        image_size_hw: (height, width) tuple
        vp_coords: [x, y] vanishing point coordinates (if known)
        gt_data: Dictionary with 'pitch' and 'yaw' keys containing ground truth data in radians
        focal_length: Focal length in pixels used for both axes of the
            intrinsic matrix; the default is based on the comma ai readme

    Returns:
        Tuple of (final_pitch_rad, final_yaw_rad, camera_mounting_info)

    Raises:
        ValueError: If neither vp_coords nor gt_data is provided
    """
    # Fail fast: without one of the two inputs no mounting offset can be
    # derived, so there is no point in decoding the poses first.
    if vp_coords is None and gt_data is None:
        raise ValueError("Either vp_coords or gt_data must be provided")

    height, width = image_size_hw

    # Camera intrinsics with the principal point at the image centre
    camera_intrinsics: NDArray[np.float32] = np.array([
        [focal_length, 0, width / 2.0],
        [0, focal_length, height / 2.0],
        [0, 0, 1]
    ], dtype=np.float32)

    # Extract VGGT relative motion
    vggt_pitch_rad, vggt_yaw_rad = extract_pitch_yaw_radians(pose_enc_data, image_size_hw)

    print("=== VGGT Relative Motion (radians) ===")
    print(f"Pitch range: {vggt_pitch_rad.min():.6f} to {vggt_pitch_rad.max():.6f}")
    print(f"Yaw range: {vggt_yaw_rad.min():.6f} to {vggt_yaw_rad.max():.6f}")
    print(f"First frame: Pitch={vggt_pitch_rad[0]:.6f}, Yaw={vggt_yaw_rad[0]:.6f}")

    # Determine vanishing point; an explicitly supplied one takes precedence
    # over the ground truth.
    if vp_coords is None:
        print("\n=== Estimating Vanishing Point from Ground Truth ===")
        vp_coords = find_vanishing_point_from_gt(
            gt_data['pitch'], gt_data['yaw'],
            vggt_pitch_rad, vggt_yaw_rad,
            camera_intrinsics
        )
        print(f"Estimated vanishing point: [{vp_coords[0]:.2f}, {vp_coords[1]:.2f}]")

    # Calculate camera mounting offset
    pitch_offset_rad, yaw_offset_rad = calculate_camera_mounting_from_vp(vp_coords, camera_intrinsics)

    print("\n=== Camera Mounting Angles (radians) ===")
    print(f"Pitch offset: {pitch_offset_rad:.6f} rad ({np.degrees(pitch_offset_rad):.3f}°)")
    print(f"Yaw offset: {yaw_offset_rad:.6f} rad ({np.degrees(yaw_offset_rad):.3f}°)")

    # Combine mounting + relative motion
    final_pitch_rad: NDArray[np.float64] = vggt_pitch_rad + pitch_offset_rad
    final_yaw_rad: NDArray[np.float64] = vggt_yaw_rad + yaw_offset_rad

    print("\n=== Final Camera Pose (radians) ===")
    print(f"Pitch: {final_pitch_rad}")
    print(f"Yaw: {final_yaw_rad}")
    print(f"\nFirst frame final: Pitch={final_pitch_rad[0]:.6f}, Yaw={final_yaw_rad[0]:.6f}")

    mounting_info: dict[str, Any] = {
        'vanishing_point': vp_coords,
        'pitch_offset_rad': pitch_offset_rad,
        'yaw_offset_rad': yaw_offset_rad,
        'pitch_offset_deg': np.degrees(pitch_offset_rad),
        'yaw_offset_deg': np.degrees(yaw_offset_rad),
        'camera_intrinsics': camera_intrinsics
    }

    return final_pitch_rad, final_yaw_rad, mounting_info

def run_calibration_example(
    pose_enc_data: torch.Tensor
) -> tuple[NDArray[np.float64], NDArray[np.float64], dict[str, Any]]:
    """
    Example usage with your VGGT pose_enc_data.

    Runs the calibration with the vanishing point estimated from hard-coded
    ground truth for ten frames, then prints a comparison of the result
    against that same ground truth.

    Args:
        pose_enc_data: VGGT pose encoding tensor

    Returns:
        Tuple of (final_pitch, final_yaw, mounting_info)
    """
    HEIGHT: int = 874
    WIDTH: int = 1164
    # TODO: pass the focal length (910.0) through explicitly instead of
    # relying on the value used inside complete_camera_calibration.

    gt_pitch_rad: NDArray[np.float64] = np.array([
        3.346066188150387949e-02, 3.332004697594769665e-02, 3.326381557788743448e-02,
        3.328000156359077477e-02, 3.333414362855208202e-02, 3.341528307294515387e-02,
        3.344653297968317590e-02, 3.328151290750939323e-02, 3.323636234017846025e-02,
        3.322040813271374959e-02
    ])

    gt_yaw_rad: NDArray[np.float64] = np.array([
        3.149205029088487234e-02, 3.131719816086165481e-02, 3.120279874728593833e-02,
        3.122725488847126821e-02, 3.096299378275137182e-02, 3.096141898069740273e-02,
        3.097767903556855607e-02, 3.089276518450867828e-02, 3.106456737090537018e-02,
        3.105538050544190062e-02
    ])

    gt_data: dict[str, NDArray[np.float64]] = {'pitch': gt_pitch_rad, 'yaw': gt_yaw_rad}

    print("=== Auto-estimate vanishing point ===")
    final_pitch1, final_yaw1, info1 = complete_camera_calibration(
        pose_enc_data, (HEIGHT, WIDTH), gt_data=gt_data
    )

    # Absolute per-frame errors, computed once and reused for every line below
    pitch_abs_err = np.abs(gt_pitch_rad - final_pitch1)
    yaw_abs_err = np.abs(gt_yaw_rad - final_yaw1)
    max_pitch_err = np.max(pitch_abs_err)
    max_yaw_err = np.max(yaw_abs_err)

    print("\n=== Comparison with Ground Truth ===")
    print(f"GT Pitch:    {gt_pitch_rad}")
    print(f"GT Yaw:      {gt_yaw_rad}")
    print(f"Final Pitch: {final_pitch1}")
    print(f"Final Yaw:   {final_yaw1}")
    print(f"Difference Pitch:  {pitch_abs_err}")
    print(f"Difference Yaw:    {yaw_abs_err}")
    print(f"Max pitch error: {max_pitch_err:.6f} rad ({np.degrees(max_pitch_err):.3f}°)")
    print(f"Max yaw error: {max_yaw_err:.6f} rad ({np.degrees(max_yaw_err):.3f}°)")

    return final_pitch1, final_yaw1, info1

# Run the example on the pose encodings and keep the results at notebook scope.
final_pitch, final_yaw, mounting_info = run_calibration_example(pose_enc_data)

=== Auto-estimate vanishing point ===
=== VGGT Relative Motion (radians) ===
Pitch range: -0.023074 to 0.000143
Yaw range: -0.009259 to 0.000724
First frame: Pitch=0.000143, Yaw=0.000382

=== Estimating Vanishing Point from Ground Truth ===
Estimated vanishing point: [610.32, 406.66]

=== Camera Mounting Angles (radians) ===
Pitch offset: 0.033317 rad (1.909°)
Yaw offset: 0.031110 rad (1.782°)

=== Final Camera Pose (radians) ===
Pitch: [0.03346064 0.03250136 0.02902086 0.02677931 0.02598308 0.01893042
 0.01561101 0.01213785 0.01024308 0.01103576]
Yaw: [0.03149206 0.03160299 0.03183388 0.03058257 0.02805724 0.02650303
 0.024784   0.02391155 0.02343872 0.02185073]

First frame final: Pitch=0.033461, Yaw=0.031492

=== Comparison with Ground Truth ===
GT Pitch:    [0.03346066 0.03332005 0.03326382 0.03328    0.03333414 0.03341528
 0.03344653 0.03328151 0.03323636 0.03322041]
GT Yaw:      [0.03149205 0.0313172  0.0312028  0.03122725 0.03096299 0.03096142
 0.03097768 0.03089277 0.03106457 0

In [9]:
# Source links: 
# https://github.com/commaai/openpilot/blob/c460f5150f961ef77b8057c1fe9532086b9768dd/common/transformations/camera.py#L99
# https://thomasfermi.github.io/Algorithms-for-Automated-Driving/CameraCalibration/VanishingPointCameraCalibration.html?highlight=vanishing%20point#vanishing-point-method


def get_vanishing_point_simple(
    pose_enc_data: torch.Tensor,
    image_size_hw: tuple[int, int]
) -> torch.Tensor:
    """
    Simple function to get forward direction vanishing point (lane lines).
    Returns [u, v] coordinates.

    The direction [0, 0, 1] is rotated by each frame's rotation matrix,
    projected with that frame's intrinsics and de-homogenised. No guard is
    applied when the projected depth component is zero.

    Args:
        pose_enc_data: VGGT pose encoding tensor
        image_size_hw: Tuple of (height, width) in pixels

    Returns:
        Vanishing points tensor of shape [B, S, 2] where B=batch, S=sequence
    """
    extrinsics, intrinsics = pose_encoding_to_extri_intri(
        pose_enc_data,
        image_size_hw=image_size_hw,
        pose_encoding_type="absT_quaR_FoV",
        build_intrinsics=True
    )

    # Extract rotation matrices [B, S, 3, 3] (first three columns of each extrinsic)
    rotation: torch.Tensor = extrinsics[..., :3]

    # Transform direction: R @ [0, 0, 1] is simply the third column of R -> [B, S, 3]
    forward_dir: torch.Tensor = rotation[..., :, 2]

    # Project: K @ forward_dir. matmul broadcasts over the leading [B, S]
    # dimensions, so no .view() is needed; .view() on this strided column
    # slice only works when the memory layout happens to be view-compatible.
    projected: torch.Tensor = torch.matmul(
        intrinsics, forward_dir.unsqueeze(-1)
    ).squeeze(-1)

    # Convert to pixel coordinates [B, S, 2]
    vp: torch.Tensor = projected[..., :2] / projected[..., 2:3]

    return vp


# Usage example with types
vanishing_points: torch.Tensor = get_vanishing_point_simple(pose_enc_data, (874, 1164))

# Only the first batch element is inspected below.
first_batch_vps: torch.Tensor = vanishing_points[0]

# Vanishing point of the first frame
gt_vp: NDArray[np.float32] = first_batch_vps[0].cpu().numpy()
print(f"gt_vp = [{gt_vp[0]:.2f}, {gt_vp[1]:.2f}]")

# Mean vanishing point over all frames (averaged on the torch side)
gt_vp_avg: NDArray[np.float32] = first_batch_vps.mean(dim=0).cpu().numpy()
print(f"gt_vp (avg) = [{gt_vp_avg[0]:.2f}, {gt_vp_avg[1]:.2f}]")

print("\nAll frames:")
for i, frame_vp in enumerate(first_batch_vps):
    vp: NDArray[np.float32] = frame_vp.cpu().numpy()
    print(f"Frame {i}: [{vp[0]:.2f}, {vp[1]:.2f}]")

gt_vp = [582.10, 436.96]
gt_vp (avg) = [573.37, 439.37]

All frames:
Frame 0: [582.10, 436.96]
Frame 1: [581.43, 438.53]
Frame 2: [578.98, 438.86]
Frame 3: [577.40, 439.28]
Frame 4: [576.81, 439.35]
Frame 5: [571.55, 439.85]
Frame 6: [569.12, 438.88]
Frame 7: [566.30, 440.06]
Frame 8: [564.67, 440.72]
Frame 9: [565.32, 441.26]
