In [1]:
import torch
import struct
import numpy as np
import matplotlib.pyplot as plt
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
from torch.cuda.amp import autocast

device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+) 
dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16

In [2]:
# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
model = VGGT()
_URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
model.to(device)

VGGT(
  (aggregator): Aggregator(
    (patch_embed): DinoVisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
        (norm): Identity()
      )
      (blocks): ModuleList(
        (0-23): 24 x NestedTensorBlock(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): MemEffAttention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): LayerScale()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (act): GELU(approximate=

In [3]:
SCENE="banana"
SKIP=1

if SCENE=="banana": 
    # Load and preprocess example images (replace with your own image paths)
    image_names = [
        "/home/skhalid/Documents/data/banana/input/frame_00001.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00002.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00003.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00004.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00005.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00006.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00007.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00008.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00009.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00010.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00011.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00012.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00013.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00014.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00015.JPG", 
        "/home/skhalid/Documents/data/banana/input/frame_00016.JPG"
    ]
    ### BANANA
    width = 3008
    height = 2000
    BASE_PATH = "/home/skhalid/Documents/data/banana"
    INTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/cameras.bin"
    EXTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/images.bin"
    PTS_PATH = BASE_PATH+"/sparse/0/points3D.ply"
    PREFIX = "frame_"
    START_ID = 0
    N = 1_000

elif SCENE=="lego": 
    ### LEGO
    image_names = ["/home/skhalid/Documents/data/nerf_synthetic/lego/train/r_"+str(v)+".png" for v in range(0, 99, SKIP)]
    width = 800
    height = 800
    BASE_PATH = "/home/skhalid/Documents/data/nerf_synthetic/lego/"
    INTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/cameras.bin"
    EXTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/images.bin"
    PTS_PATH = BASE_PATH+"/sparse/0/points3D.ply"
    PREFIX = "r_"
    START_ID = 0
    N = 200_000

elif SCENE=="bicycle": 
    ### BICYCLE
    BASE="/home/skhalid/Documents/data/360_v2/bicycle/images_4/_DSC"
    image_names = [BASE+str(v)+".JPG" for v in range(8679, 8873, SKIP)]
    width = 1236
    height = 821    
    BASE_PATH = "/home/skhalid/Documents/data/360_v2/bicycle"
    INTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/cameras.bin"
    EXTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/images.bin"
    PTS_PATH = BASE_PATH+"/sparse/0/points3D.ply"
    PREFIX = "_DSC"
    START_ID = 0
    N = 3_000_000
    # test_cases = ["8679.JPG",
    #               "8687.JPG",
    #               "8695.JPG",
    #               "8703.JPG",
    #               "8711.JPG",
    #               "8719.JPG",
    #               "8727.JPG",
    #               "8735.JPG",
    #               "8744.JPG",
    #               "8752.JPG",
    #               "8760.JPG",
    #               "8768.JPG",
    #               "8776.JPG",
    #               "8784.JPG",
    #               "8792.JPG",
    #               "8800.JPG",
    #               "8808.JPG",
    #               "8816.JPG",
    #               "8824.JPG",
    #               "8832.JPG",
    #               "8840.JPG",
    #               "8848.JPG",
    #               "8856.JPG",
    #               "8864.JPG",
    #               "8872.JPG"]
    # for test_case in test_cases:
    #     image_names.append(BASE+str(test_case))

elif SCENE=="truck": 
    ### BICYCLE
    BASE="/home/skhalid/Documents/data/tandt_db/tandt/truck/images/"
    image_names = [BASE+str(v).zfill(6)+".jpg" for v in range(1, 252, SKIP)]
    width = 1957
    height = 1091    
    BASE_PATH = "/home/skhalid/Documents/data/tandt_db/tandt/truck"
    INTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/cameras.bin"
    EXTRINSICS_BINARY_PATH = BASE_PATH+"/sparse/0/images.bin"
    PTS_PATH = BASE_PATH+"/sparse/0/points3D.ply"
    PREFIX = ""
    START_ID = 0
    N = 200_000
    # test_cases = ["8679.JPG",
    #               "8687.JPG",
    #               "8695.JPG",
    #               "8703.JPG",
    #               "8711.JPG",
    #               "8719.JPG",
    #               "8727.JPG",
    #               "8735.JPG",
    #               "8744.JPG",
    #               "8752.JPG",
    #               "8760.JPG",
    #               "8768.JPG",
    #               "8776.JPG",
    #               "8784.JPG",
    #               "8792.JPG",
    #               "8800.JPG",
    #               "8808.JPG",
    #               "8816.JPG",
    #               "8824.JPG",
    #               "8832.JPG",
    #               "8840.JPG",
    #               "8848.JPG",
    #               "8856.JPG",
    #               "8864.JPG",
    #               "8872.JPG"]
    # for test_case in test_cases:
    #     image_names.append(BASE+str(test_case))


In [4]:
import torch
from tqdm import tqdm

def run_batched_camera_inference(model, image_names, batch_size=8, device='cuda', dtype=torch.float16):
    from vggt.utils.pose_enc import pose_encoding_to_extri_intri
    from vggt.utils.geometry import unproject_depth_map_to_point_map
    # from vggt.utils.io import load_and_preprocess_images

    all_extrinsics = []
    all_intrinsics = []
    all_world_points = []
    depth_maps = []
    depth_conf_maps = []
    batch_tensors = []

    # Batch the rest of the images
    print(f"Processing the rest of {len(image_names)} images in batches of {batch_size}...")
    for i in tqdm(range(0, len(image_names), batch_size)):
        batch_names = image_names[i:i + batch_size]
        batch_tensor = load_and_preprocess_images(batch_names).to(device)

        if i==0:
            first_image = batch_tensor[0]
            print("first_image.shape: {}".format(batch_tensor.shape))
        else:
            # Add the first reference image to this batch as well
            batch_tensor = torch.cat((first_image[None], batch_tensor), dim=0)

        with torch.no_grad(), torch.cuda.amp.autocast(dtype=dtype):
            batch_tensor = batch_tensor[None]  # Add batch dim
            agg_tokens, ps_idx = model.aggregator(batch_tensor)

            pose_enc = model.camera_head(agg_tokens)[-1]
            extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, batch_tensor.shape[-2:])

            depth_map, depth_conf_map = model.depth_head(agg_tokens, batch_tensor, ps_idx)
            
            point_map_unproj = unproject_depth_map_to_point_map(depth_map.squeeze(0), extrinsic.squeeze(0), intrinsic.squeeze(0))
    
            # if i==0:    
            #     print("batch: {} | point_map_unproj.shape: {}".format(i, point_map_unproj.shape))
            # else:
            #     print("batch: {} | point_map_unproj[1:, ...].shape: {}".format(i, point_map_unproj[1:, ...].shape))

            if i==0:
                all_extrinsics.append(extrinsic[0, ...])
                all_intrinsics.append(intrinsic[0, ...])
                all_world_points.append(point_map_unproj)
                depth_maps.append(depth_map[0, ...])
                depth_conf_maps.append(depth_conf_map[0, ...])
                batch_tensors.append(batch_tensor[0, ...])
            else:
                all_extrinsics.append(extrinsic[0, 1:])
                all_intrinsics.append(intrinsic[0, 1:])
                all_world_points.append(point_map_unproj[1:, ...])
                depth_maps.append(depth_map[0, 1:, ...])
                depth_conf_maps.append(depth_conf_map[0, 1:, ...])
                batch_tensors.append(batch_tensor[0, 1:, ...])

            print("extrinsic: {}".format(extrinsic.shape))
            print("intrinsic: {}".format(intrinsic.shape))
            print("point_map_unproj: {}".format(point_map_unproj.shape))
            print("depth_map: {}".format(depth_map.shape))
            print("depth_conf_map: {}".format(depth_conf_map.shape))
            print("batch_tensor: {}".format(batch_tensor.shape))

    # Stack everything
    batch_tensors = torch.cat(batch_tensors)  # [N, 4, 4]
    all_extrinsics = torch.cat(all_extrinsics)  # [N, 4, 4]
    all_intrinsics = torch.cat(all_intrinsics)  # [N, 3, 3]
    all_world_points = np.concatenate(all_world_points)  # [N, H, W, 3]
    depth_maps = torch.cat(depth_maps, dim=0)  # [N, H, W, 3]
    depth_conf_maps = torch.cat(depth_conf_maps, dim=0)  # [N, H, W, 3]

    return {
        "all_extrinsics": all_extrinsics, 
        "all_intrinsics": all_intrinsics, 
        "all_world_points": all_world_points,
        "depth_maps": depth_maps,
        "depth_conf_maps": depth_conf_maps,
        "all_images": batch_tensors
    }

    # # Predict Tracks
    # # choose your own points to track, with shape (N, 2) for one scene
    # query_points = torch.FloatTensor([[100.0, 200.0], 
    #                                     [60.72, 259.94]]).to(device)
    # track_list, vis_score, conf_score = model.track_head(aggregated_tokens_list, images, ps_idx, query_points=query_points[None])

In [5]:
# BATCH_SIZE=3

# res = run_batched_camera_inference(model, image_names, batch_size=BATCH_SIZE)

# all_extrinsics = res["all_extrinsics"]
# all_intrinsics = res["all_intrinsics"]
# all_world_points = res["all_world_points"]

In [6]:
# print(all_world_points.shape)
# print(all_extrinsics.shape)
# print(all_intrinsics.shape)

In [7]:
import torch
import numpy as np
import json
from scipy.spatial.transform import Rotation as R

def pose_tensor_to_matrix(translation, quaternion):
    """Convert translation + quaternion to 4x4 pose matrix."""
    r = R.from_quat(quaternion.cpu().numpy())
    R_mat = r.as_matrix()  # [3, 3]

    pose_matrix = np.eye(4)
    pose_matrix[:3, :3] = R_mat
    pose_matrix[:3, 3] = translation.cpu().numpy()

    return pose_matrix.tolist()

def export_poses_as_json(pred_pose_tensor, output_path, image_folder="images"):
    frames = []

    # If batched, flatten to a list
    poses = pred_pose_tensor.view(-1, pred_pose_tensor.shape[-1])  # [B * N, 10]

    for i, pose in enumerate(poses):
        translation = pose[0:3]
        quaternion = pose[3:7]
        fx = pose[7].item()
        fy = pose[8].item()
        cx = 0.5  # assuming normalized cx/cy; adjust as needed
        cy = 0.5

        transform_matrix = pose_tensor_to_matrix(translation, quaternion)

        frame = {
            "file_path": f"{image_folder}/{i:06d}.png",
            "transform_matrix": transform_matrix,
            "intrinsics": {
                "fx": fx,
                "fy": fy,
                "cx": cx,
                "cy": cy
            }
        }

        print(frame)

        frames.append(frame)

    data = {
        "frames": frames
    }

    with open(output_path, "w") as f:
        json.dump(data, f, indent=4)

    print(f"Saved poses to {output_path}")

In [8]:
def write_intrinsics_binary_from_poses(intrinsics_tensor, width, height, dims, output_path, camera_model="PINHOLE", start_id=1):
    """
    Write camera intrinsics from a tensor of 3x3 matrices.

    Args:
        intrinsics_tensor: Tensor of shape [B, N, 3, 3].
        width: Image width.
        height: Image height.
        output_path: Path to cameras.bin.
        camera_model: Camera model name.
        start_id: Starting camera ID.
    """
    CAMERA_MODEL_IDS = {
        "SIMPLE_PINHOLE": 0,
        "PINHOLE": 1,
        "SIMPLE_RADIAL": 2,
        "RADIAL": 3,
        "OPENCV": 4,
    }
    model_id = CAMERA_MODEL_IDS.get(camera_model.upper(), 1)

    intrinsics_tensor = intrinsics_tensor.view(-1, 3, 3)  # [num_cameras, 3, 3]
    num_cameras = intrinsics_tensor.shape[0]
    w, h = dims


    SCALE = width / w # <---------------------------------------------------------------------------------------FIXME
    # SCALE = 2.0 # <---------------------------------------------------------------------------------------FIXME
    print("width: {} | w: {} | scale: {}".format(width, w, SCALE))


    with open(output_path, "wb") as f:
        f.write(struct.pack("Q", num_cameras))  # Number of cameras

        for i in range(num_cameras):
            intrinsics = intrinsics_tensor[i].cpu().numpy()
            fx = intrinsics[0, 0] * SCALE
            fy = intrinsics[1, 1] * SCALE
            cx = intrinsics[0, 2] * SCALE
            cy = intrinsics[1, 2] * SCALE

            params = [fx, fy, cx, cy]
            camera_id = start_id + i

            # Write: camera_id, model_id, width, height
            f.write(struct.pack("iiQQ", camera_id, model_id, width, height))
            # Write parameters
            f.write(struct.pack("d" * len(params), *params))


In [9]:
# export_poses_as_json(predictions["pose_enc"][0], "output_path", image_folder="images")

In [10]:
def convert_to_colmap_format(fns, extrinsics_tensor, image_name_prefix="_DSC", start_id=8679):
    """
    Convert extrinsics matrices into COLMAP-format dictionaries.

    Args:
        extrinsics_tensor: Tensor of shape [B, N, 3, 4] containing [R|t].
        image_name_prefix: Prefix for image names.
        start_id: Starting ID for images.

    Returns:
        Dictionary of COLMAP-style extrinsics.
    """
    images = {}
    extrinsics_tensor = extrinsics_tensor.view(-1, 3, 4)  # [num_poses, 3, 4]
    translation_scale_factor = 5.0  # Current (probably too small)
    print(extrinsics_tensor)

    for idx, extrinsic in enumerate(extrinsics_tensor, start=start_id):
        # Extract rotation matrix and translation
        R = extrinsic[:, :3].cpu().numpy()
        t = extrinsic[:, 3].cpu().numpy()

        # Convert rotation matrix to quaternion (COLMAP uses [w, x, y, z] format)
        from scipy.spatial.transform import Rotation
        qvec = Rotation.from_matrix(R).as_quat()  # [x, y, z, w]
        qvec_colmap = np.array([qvec[3], qvec[0], qvec[1], qvec[2]])  # Reorder to [w, x, y, z]
        qvec_colmap = qvec_colmap / np.linalg.norm(qvec_colmap)  # Ensure it's normalized

        image_name = f"{image_name_prefix}{idx:05d}.JPG"
        image_name = fns[idx]

        images[idx] = {
            "id": idx,
            "qvec": qvec_colmap,
            "tvec": t,
            "camera_id": 1,
            "name": image_name,
            "xys": np.zeros((0, 2)),
            "point3D_ids": np.array([])
        }

    return images


In [11]:
import struct

def write_extrinsics_binary(images, output_path):
    with open(output_path, "wb") as f:
        # Write number of registered images (uint64)
        f.write(struct.pack("Q", len(images)))

        for image_id, img in images.items():
            # Write: image_id (uint32), qvec (4 doubles), tvec (3 doubles), camera_id (uint32)
            f.write(struct.pack("i", img["id"]))  # IMAGE_ID
            f.write(struct.pack("dddd", *img["qvec"]))  # qw, qx, qy, qz
            f.write(struct.pack("ddd", *img["tvec"]))  # tx, ty, tz
            f.write(struct.pack("i", img["camera_id"]))  # CAMERA_ID

            # Write the image name (null-terminated string)
            f.write(img["name"].encode("utf-8") + b'\x00')

            # No 2D points
            num_points2D = 0
            f.write(struct.pack("Q", num_points2D))

            # (Skip point data)

In [12]:
# b, h, w, c = all_world_points.shape
# images = convert_to_colmap_format(image_names, all_extrinsics, image_name_prefix=PREFIX, start_id=START_ID)
# write_extrinsics_binary(images, EXTRINSICS_BINARY_PATH)
# write_intrinsics_binary_from_poses(all_intrinsics, width, height, (w, h), INTRINSICS_BINARY_PATH)

In [13]:
import torch
import numpy as np
import open3d as o3d
from plyfile import PlyData, PlyElement

def save_point_cloud_ply(filepath, points, colors=None, normals=None):
    """
    Save a point cloud to a PLY file.
    
    Args:
        filepath: Output path.
        points: (N, 3) numpy array of XYZ coordinates.
        colors: (N, 3) numpy array of RGB values in [0, 1] or [0, 255]. Optional.
        normals: (N, 3) numpy array of normals. Optional.
    """

    num_points = points.shape[0]

    # Default colors → white
    if colors is None:
        colors = np.ones_like(points) * 255
    else:
        # If in [0, 1], scale to [0, 255]
        if colors.max() <= 1.0:
            colors = (colors * 255).astype(np.uint8)

    # Default normals → zero
    if normals is None:
        normals = np.zeros_like(points)

    # Define PLY vertex structure
    vertex_data = np.array(
        [
            (
                points[i, 0], points[i, 1], points[i, 2],
                normals[i, 0], normals[i, 1], normals[i, 2],
                colors[i, 0], colors[i, 1], colors[i, 2],
            )
            for i in range(num_points)
        ],
        dtype=[
            ('x', 'f4'), ('y', 'f4'), ('z', 'f4'),
            ('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4'),
            ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'),
        ]
    )

    # Create PlyElement and write to file
    ply_element = PlyElement.describe(vertex_data, 'vertex')
    PlyData([ply_element]).write(filepath)

    print(f"Saved point cloud with {num_points} points to {filepath}")

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [14]:
# # Example image tensor: replace this with your actual variable
# images = predictions['images']  # shape: [1, 16, 350, 518, 1]

# # Remove batch dimension and channel dimension
# images = images.squeeze(0).squeeze(-1)  # Now shape is [16, 350, 518]

# # Loop and visualize
# for i in range(images.shape[0]):
#     img = images[i].cpu().numpy()

#     plt.figure(figsize=(6, 6))
#     if img.ndim == 2:
#         plt.imshow(img, cmap='gray')  # Grayscale
#     elif img.ndim == 3 and img.shape[0] in [1, 3]:
#         # Permute if channels first
#         if img.shape[0] == 3:
#             img = img.transpose(1, 2, 0)
#         plt.imshow(img)
#     else:
#         raise ValueError(f"Unsupported image shape: {img.shape}")

#     plt.axis('off')
#     plt.title(f"Image {i}")
#     plt.show()

In [15]:
# # Example image tensor: replace this with your actual variable
# images = predictions['depth']  # shape: [1, 16, 350, 518, 1]

# # Remove batch dimension and channel dimension
# images = images.squeeze(0).squeeze(-1)  # Now shape is [16, 350, 518]

# # Loop and visualize
# for i in range(images.shape[0]):
#     img = images[i].cpu().numpy()

#     plt.figure(figsize=(6, 6))
#     if img.ndim == 2:
#         plt.imshow(img, cmap='gray')  # Grayscale
#     elif img.ndim == 3 and img.shape[0] in [1, 3]:
#         # Permute if channels first
#         if img.shape[0] == 3:
#             img = img.transpose(1, 2, 0)
#         plt.imshow(img)
#     else:
#         raise ValueError(f"Unsupported image shape: {img.shape}")

#     plt.axis('off')
#     plt.title(f"Image {i}")
#     plt.show()

In [16]:
# images = predictions['depth_conf']  # shape: [1, 16, 350, 518, 1]

# # Remove batch dimension and channel dimension
# images = images.squeeze(0).squeeze(-1)  # Now shape is [16, 350, 518]

# # Loop and visualize
# for i in range(images.shape[0]):
#     img = images[i].cpu().numpy()

#     plt.figure(figsize=(6, 6))
#     if img.ndim == 2:
#         plt.imshow(img, cmap='gray')  # Grayscale
#     elif img.ndim == 3 and img.shape[0] in [1, 3]:
#         # Permute if channels first
#         if img.shape[0] == 3:
#             img = img.transpose(1, 2, 0)
#         plt.imshow(img)
#     else:
#         raise ValueError(f"Unsupported image shape: {img.shape}")

#     plt.axis('off')
#     plt.title(f"Image {i}")
#     plt.show()

In [17]:
'''
Run inference here
'''
BATCH_SIZE=20
SINGLE_SAMPLE = False
normals = None

predictions = run_batched_camera_inference(model, image_names, batch_size=BATCH_SIZE)

all_extrinsics = predictions["all_extrinsics"]
all_intrinsics = predictions["all_intrinsics"]
all_world_points = predictions["all_world_points"]
depth_conf_maps = predictions["depth_conf_maps"]

b, h, w, c = all_world_points.shape
images = convert_to_colmap_format(image_names, all_extrinsics, image_name_prefix=PREFIX, start_id=START_ID)
write_extrinsics_binary(images, EXTRINSICS_BINARY_PATH)
write_intrinsics_binary_from_poses(all_intrinsics, width, height, (w, h), INTRINSICS_BINARY_PATH)

if SINGLE_SAMPLE:
    all_images = load_and_preprocess_images(image_names).to(device)
    all_images = all_images[0][None, ...]
    # Remove batch dimension
    world_points = all_world_points[0]  # shape: [16, 350, 518, 3]
else:
    all_images = load_and_preprocess_images(image_names).to(device)
    # Remove batch dimension
    world_points = all_world_points  # shape: [16, 350, 518, 3]

print(all_images.shape)
print(all_world_points.shape)

# Flatten to [N, 3]
points_pmap = world_points.reshape(-1, 3)  # shape: [16 * 350 * 518, 3]
colors_pmap = all_images.permute(0,2,3,1).reshape(-1, 3).cpu().numpy()

# Convert to NumPy
points_np = points_pmap
colors_np = colors_pmap

# Optional: Remove invalid points (e.g., zero points)
mask = np.logical_and.reduce([np.isfinite(points_np[:, 0]),
                              np.isfinite(points_np[:, 1]),
                              np.isfinite(points_np[:, 2])])
points_np = points_np[mask]
colors_np = colors_np[mask]

if len(points_np) >= N:
    indices = np.random.choice(len(points_np), N, replace=False)
    sampled_points = points_np[indices]
    sampled_colors = colors_np[indices]
else:
    print(f"Warning: Only {len(points_np)} points available, returning all.")
    sampled_points = points_np
    sampled_colors = colors_np

# Save as PLY
save_point_cloud_ply(PTS_PATH, sampled_points, sampled_colors, normals)

Processing the rest of 16 images in batches of 20...


  0%|          | 0/1 [00:00<?, ?it/s]

first_image.shape: torch.Size([16, 3, 350, 518])


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=dtype):
100%|██████████| 1/1 [00:01<00:00,  1.59s/it]

extrinsic: torch.Size([1, 16, 3, 4])
intrinsic: torch.Size([1, 16, 3, 3])
point_map_unproj: (16, 350, 518, 3)
depth_map: torch.Size([1, 16, 350, 518, 1])
depth_conf_map: torch.Size([1, 16, 350, 518])
batch_tensor: torch.Size([1, 16, 3, 350, 518])
tensor([[[ 1.0000e+00, -5.4717e-05,  2.2769e-05, -1.4901e-06],
         [ 5.4717e-05,  1.0000e+00, -8.1897e-05,  3.2306e-05],
         [-2.2769e-05,  8.1897e-05,  1.0000e+00, -1.0312e-04]],

        [[ 9.9529e-01,  3.3007e-02, -9.1150e-02,  1.2115e-01],
         [-3.3312e-02,  9.9944e-01, -1.7226e-03, -3.7689e-03],
         [ 9.1028e-02,  4.7474e-03,  9.9584e-01, -3.6488e-03]],

        [[ 9.9724e-01,  1.0303e-02, -7.3572e-02,  1.0443e-01],
         [-1.2547e-02,  9.9947e-01, -3.0131e-02,  1.3420e-02],
         [ 7.3206e-02,  3.0986e-02,  9.9684e-01, -2.3621e-01]],

        [[ 8.5601e-01, -2.4255e-01,  4.5653e-01, -4.5044e-01],
         [ 3.0166e-01,  9.5154e-01, -6.0089e-02,  5.5786e-02],
         [-4.1989e-01,  1.8918e-01,  8.8764e-01, -7.00




torch.Size([16, 3, 350, 518])
(16, 350, 518, 3)
Saved point cloud with 1000 points to /home/skhalid/Documents/data/banana/sparse/0/points3D.ply


In [18]:
'''
SAMPLE OBJECT SEGMENTATION (BACKUP)
'''

'\nSAMPLE OBJECT SEGMENTATION (BACKUP)\n'

In [19]:
# from ultralytics import YOLO
# import matplotlib.pyplot as plt

# model = YOLO("yolo11n-seg")

# # results = model("https://ultralytics.com/images/bus.jpg")
# # results = model(["/home/skhalid/Documents/data/tandt_db/tandt/truck/images/000001.jpg"])
# # results = model(["/home/skhalid/Documents/data/nerf_synthetic/lego/images/r_0.png"])
# results = model(["/home/skhalid/Documents/data/bicycle/images_4/_DSC8679.JPG"])

# for result in results:
#     result.show()

# # fig = plt.figure(figsize=(10,10))
# # plt.imshow(results.render()[0])
# # plt.axis("off")
# # plt.show()

In [20]:
# import torch
# import numpy as np
# import matplotlib.pyplot as plt
# import cv2

# from PIL import Image
# from segment_anything import sam_model_registry, SamPredictor
# import clip
# import urllib.request
# from torchvision import transforms

# # Load image
# # image_path = "/home/skhalid/Documents/data/tandt_db/tandt/truck/images/000001.jpg"
# # image_path = "/home/skhalid/Documents/data/nerf_synthetic/lego/images/r_0.png"
# image_path = "/home/skhalid/Documents/data/bicycle/images_4/_DSC8679.JPG"
# # image_path = "/home/skhalid/Documents/data/Synthetic4Relight/hotdog/train/000.png"
# # image_path = "/home/skhalid/Documents/data/data_dtu/DTU_scan24/inputs/images/000000.png"
# # image_path = "/home/skhalid/Documents/data/banana/images/frame_00002.JPG"

# image_rgb = np.array(Image.open(image_path).convert("RGB"))

# # Step 1: Load Segment Anything Model
# sam_checkpoint = "/home/skhalid/Downloads/sam_vit_l.pth"
# model_type = "vit_l"

# sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
# sam.eval().cuda()

# predictor = SamPredictor(sam)
# predictor.set_image(image_rgb)

# # Step 2: Predict masks with SAM (automatic mode)
# from segment_anything import SamAutomaticMaskGenerator

# mask_generator = SamAutomaticMaskGenerator(sam)
# masks = mask_generator.generate(image_rgb)

# # Step 3: Load CLIP
# device = "cuda" if torch.cuda.is_available() else "cpu"
# clip_model, preprocess = clip.load("ViT-L/14", device=device)

# # Step 4: Classify each mask region with CLIP
# def classify_mask_with_clip(image_rgb, mask, clip_model, preprocess):
#     # Crop the mask region
#     # x0, y0, x1, y1 = cv2.boundingRect(mask.astype(np.uint8)).tolist()
#     x0, y0, x1, y1 = cv2.boundingRect(mask.astype(np.uint8))
#     cropped = image_rgb[y0:y1, x0:x1]

#     # Apply mask
#     masked_image = image_rgb.copy()
#     masked_image[~mask.astype(bool)] = 0  # Black out background
    
#     if cropped.shape[0] < 10 or cropped.shape[1] < 10:
#         return "Unknown", 0.0

#     pil_crop = Image.fromarray(cropped)
#     image_input = preprocess(pil_crop).unsqueeze(0).to(device)

#     # Define your candidate labels
#     labels = ["truck", "house", "ground", "tree", "chair", "building", "sky", "clouds", "road", "lego", "grass", "toy", "hotdog", "fruit", "food", "window"]
#     # text_inputs = torch.cat([clip.tokenize(f"a cropped snippet of an image extracted from the tanks and temples dataset that looks like {c}") for c in labels]).to(device)
#     text_inputs = torch.cat([clip.tokenize(f"the cropped image extracted from the mipnerf360 dataset that looks like {c}") for c in labels]).to(device)

#     with torch.no_grad():
#         image_features = clip_model.encode_image(image_input)
#         text_features = clip_model.encode_text(text_inputs)

#         image_features /= image_features.norm(dim=-1, keepdim=True)
#         text_features /= text_features.norm(dim=-1, keepdim=True)

#         similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
#         label_id = similarity.argmax().item()

#         confidence = similarity[0, label_id].item()
#         if confidence < 0.3:
#             return "Unknown", confidence

#         return labels[label_id], confidence

# # Step 5: Visualize masks and CLIP labels
# output_img = image_rgb.copy()
# for mask_data in masks:
#     mask = mask_data['segmentation']
#     label, confidence = classify_mask_with_clip(image_rgb, mask, clip_model, preprocess)

#     if label == "Unknown":
#         continue
# # 
#     # Draw mask
#     color = np.random.randint(0, 255, size=3)
#     output_img[mask] = 0.6 * output_img[mask] + 0.4 * color
# # 
#     # Draw label
#     y, x = np.argwhere(mask).mean(axis=0).astype(int)
#     cv2.putText(output_img, f"{label} {confidence:.2f}", (x, y), cv2.FONT_HERSHEY_SIMPLEX, 2.0, (255, 255, 255), 2)

# # Show results
# plt.figure(figsize=(12, 8))
# plt.imshow(output_img)
# plt.axis('off')
# plt.title("Segment Anything + CLIP classification")
# plt.show()


In [21]:
# import torch
# from transformers import OwlViTProcessor, OwlViTForObjectDetection
# from segment_anything import sam_model_registry, SamPredictor
# from PIL import Image
# import numpy as np
# import cv2
# import matplotlib.pyplot as plt

# # Load OWL-ViT
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = OwlViTForObjectDetection.from_pretrained("google/owlvit-large-patch14").to(device)
# processor = OwlViTProcessor.from_pretrained("google/owlvit-large-patch14")

# # Load SAM
# sam_checkpoint = "/home/skhalid/Downloads/sam_vit_l.pth"
# sam = sam_model_registry["vit_l"](checkpoint=sam_checkpoint).to(device).eval()
# predictor = SamPredictor(sam)

In [22]:
# # Load image
# image_path = "/home/skhalid/Documents/data/tandt_db/tandt/truck/images/000001.jpg"
# # image_path = "/home/skhalid/Documents/data/nerf_synthetic/lego/images/r_0.png"
# # image_path = "/home/skhalid/Documents/data/bicycle/images_4/_DSC8679.JPG"
# # image_path = "/home/skhalid/Documents/data/Synthetic4Relight/hotdog/train/000.png"
# # image_path = "/home/skhalid/Documents/data/data_dtu/DTU_scan24/inputs/images/000000.png"
# # image_path = "/home/skhalid/Documents/data/banana/images/frame_00002.JPG"

# image = Image.open(image_path).convert("RGB")
# image_np = np.array(image)

# # Text prompt
# # texts = [["car", "bicycle", "trees", "grass", "ground", "bench", "lego", "fruit"]]  # You can modify this list
# texts = [["truck", "house", "ground", "tree", "chair", "building", "sky", "clouds", "road", "lego", "grass", "toy", "hotdog", "fruit", "food", "window"]]

# # Prepare input for OWL-ViT
# inputs = processor(text=texts, images=image, return_tensors="pt").to(device)

# # Detect with OWL-ViT
# with torch.no_grad():
#     outputs = model(**inputs)

# # Get boxes and scores
# target_sizes = torch.tensor([image.size[::-1]]).to(device)
# results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.3)[0]

# # Run SAM
# predictor.set_image(image_np)

# # Process each box from OWL-ViT
# for box, score, label in zip(results["boxes"], results["scores"], results["labels"]):
#     box = box.cpu().numpy().astype(int)
#     x0, y0, x1, y1 = box
#     print(f"{texts[0][label]}: {score:.2f} at box {box}")

#     # SAM expects box in XYXY format
#     input_box = np.array([x0, y0, x1, y1])
#     masks, _, _ = predictor.predict(box=input_box[None, :], multimask_output=False)

#     # Overlay mask
#     mask = masks[0]
#     overlay = image_np.copy()
#     overlay[mask] = (255, 0, 0)  # Red mask

#     # Draw bounding box
#     cv2.rectangle(overlay, (x0, y0), (x1, y1), (0, 255, 0), 2)
#     cv2.putText(overlay, f"{texts[0][label]} {score:.2f}", (x0, y0 - 10),
#                 cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

#     # Show
#     plt.imshow(overlay)
#     plt.axis("off")
#     plt.show()
