In [1]:
# Setup the environment and import the necessary libraries

import numpy as np
import matplotlib
matplotlib.use('TkAgg') # since PyCharm has some color grading effects
import matplotlib.pyplot as plt
import open3d as o3d

import os
import random
from pathlib import Path

In [2]:
import torch
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import cv2

In [4]:
# https://figshare.com/articles/dataset/Indoor-Outdoor_dataset/4595323
# Randomly sample images from the indoor outdoor dataset

# Read in sample images
data_root = Path("data")
data_folder_path = data_root / "Indoor-Outdoor-JPGs"
indoor_folder_path = data_folder_path / "indoor"
outdoor_folder_path = data_folder_path / "outdoor"
num_samples = 3

selected_indoor_images = random.sample(os.listdir(indoor_folder_path), num_samples)
selected_outdoor_images = random.sample(os.listdir(outdoor_folder_path), num_samples)
# elected_indoor_images = ["1 (1).jpg"]
# selected_outdoor_images = ["1 (1).jpg"]

indoor_images = []
outdoor_images = []
for i in range(num_samples):
    # Flip from BGR to RGB since jpg, and add to list
    indoor_image = cv2.imread(str(indoor_folder_path / selected_indoor_images[i]), cv2.IMREAD_UNCHANGED)
    indoor_image = cv2.cvtColor(indoor_image, cv2.COLOR_BGR2RGB)
    # indoor_image = 255 - indoor_image
    outdoor_image = cv2.imread(str(outdoor_folder_path / selected_outdoor_images[i]), cv2.IMREAD_UNCHANGED)
    outdoor_image = cv2.cvtColor(outdoor_image, cv2.COLOR_BGR2RGB)
    # outdoor_image = 255 - outdoor_image

    indoor_images.append(indoor_image)
    outdoor_images.append(outdoor_image)

    # Check that shapes are reasonable
    print(indoor_image.shape, outdoor_image.shape)

(144, 210, 3) (600, 407, 3)
(469, 725, 3) (600, 407, 3)
(182, 276, 3) (407, 600, 3)


In [5]:
# Setting accelerator to either cuda (Nvidia), metal performance shaders (mps on Mac), or default cpu
device = torch.device("cuda") if torch.cuda.is_available() else (torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu"))
print(device)

mps


In [6]:
# https://github.com/LiheYoung/Depth-Anything
# Import depth anything model small, base, and large
processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-large-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-large-hf").to(device)

print(model.config)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


DepthAnythingConfig {
  "architectures": [
    "DepthAnythingForDepthEstimation"
  ],
  "backbone": null,
  "backbone_config": {
    "apply_layernorm": true,
    "architectures": [
      "Dinov2Model"
    ],
    "attention_probs_dropout_prob": 0.0,
    "drop_path_rate": 0.0,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 1024,
    "image_size": 518,
    "initializer_range": 0.02,
    "layer_norm_eps": 1e-06,
    "layerscale_value": 1.0,
    "mlp_ratio": 4,
    "model_type": "dinov2",
    "num_attention_heads": 16,
    "num_channels": 3,
    "num_hidden_layers": 24,
    "out_features": [
      "stage21",
      "stage22",
      "stage23",
      "stage24"
    ],
    "out_indices": [
      21,
      22,
      23,
      24
    ],
    "patch_size": 14,
    "qkv_bias": true,
    "reshape_hidden_states": false,
    "stage_names": [
      "stem",
      "stage1",
      "stage2",
      "stage3",
      "stage4",
      "stage5",
      "stage6",
      "stage7",
      "s

In [7]:
# Pre-process the images
indoor_samples = []
outdoor_samples = []
for i in range(num_samples):
    indoor_input = processor(images=indoor_images[i], return_tensors="pt").to(device)
    outdoor_input = processor(images=outdoor_images[i], return_tensors="pt").to(device)

    # Infer model
    with torch.no_grad():
        indoor_outputs = model(**indoor_input)
        outdoor_outputs = model(**outdoor_input)
        indoor_depth = indoor_outputs.predicted_depth
        outdoor_depth = outdoor_outputs.predicted_depth

    # Pytorch tensors to numpy arrays
    indoor_depth = indoor_depth.squeeze().cpu().numpy()
    outdoor_depth = outdoor_depth.squeeze().cpu().numpy()

    indoor_samples.append([indoor_images[i], indoor_depth])
    outdoor_samples.append([outdoor_images[i], outdoor_depth])

In [18]:
# Visualize the depth image and original images side by side
fig, axs = plt.subplots(num_samples, 4, figsize=(12, 3*num_samples))
if num_samples == 1:
    axs = axs.reshape(1, 4)

for i in range(num_samples):
    axs[i, 0].imshow(indoor_samples[i][0])
    axs[i, 0].set_title("Indoor Original")
    axs[i, 0].axis('off')

    axs[i, 1].imshow(indoor_samples[i][1])
    axs[i, 1].set_title("Indoor Depth")
    axs[i, 1].axis('off')

    axs[i, 2].imshow(outdoor_samples[i][0])
    axs[i, 2].set_title("Outdoor Original")
    axs[i, 2].axis('off')

    axs[i, 3].imshow(outdoor_samples[i][1])
    axs[i, 3].set_title("Outdoor Depth")
    axs[i, 3].axis('off')

fig.tight_layout()
plt.show()

In [9]:
def get_intrinsics(H, W, fov=55.0):
    """
    Intrinsics for a pinhole camera model.
    Assume fov of 55 degrees and central principle point.
    """
    f = 0.5 * W / np.tan((np.pi / 2) * fov / 180.0)
    cx = 0.5 * W
    cy = 0.5 * H
    # Assuming square pixels so can use f for both x and y
    return np.array([[f, 0, cx],
                     [0, f, cy],
                     [0, 0, 1]])

In [10]:
def pixel_to_point(depth_image, camera_intrinsics=None):
    """
    Converts depth image to 3D points.
    Assumes fov of 55 degrees and central principle point.
    """

    height, width = depth_image.shape
    if camera_intrinsics is None:
        camera_intrinsics = get_intrinsics(height, width)

    # Create u, v mesh grid and precompute projection triangle ratios
    fx, fy = camera_intrinsics[0,0], camera_intrinsics[1,1]
    cx, cy = camera_intrinsics[0,2], camera_intrinsics[1,2]

    # Blank mesh grid of correct size
    x = np.linspace(0, width - 1, width)
    y = np.linspace(0, height - 1, height)
    u, v = np.meshgrid(x, y)

    x_over_z = (u - cx) / fx
    y_over_z = (v - cy) / fy

    # 3D Pythagorean theorem rearranged to solve for z
    z = depth_image / np.sqrt(1.0 + x_over_z**2 + y_over_z**2)
    x = x_over_z * z
    y = y_over_z * z

    return x, y, z

In [16]:
def create_point_cloud(depth_image, color_image, camera_intrinsics=None, scale_ratio=100.0):
    height, width = depth_image.shape
    if camera_intrinsics is None:
        camera_intrinsics = get_intrinsics(height, width)

    # Rescaling color image to the size of the depth image
    # since DepthAnything was trained on only 518x518 models,
    # it automatically resizes its inputs
    color_image = cv2.resize(color_image, (width, height))

    # Making sure depth image does not contain any zeroes
    depth_image = np.maximum(depth_image, 1e-5)

    depth_image = scale_ratio / depth_image
    x, y, z = pixel_to_point(depth_image, camera_intrinsics)
    point_image = np.stack((x, y, z), axis=-1)
    # xyzrgb_image = np.concatenate([point_image, color_image], axis=-1)
    # xyzrgb not needed as o3d has that capability

    cloud = o3d.geometry.PointCloud()
    cloud.points = o3d.utility.Vector3dVector(point_image.reshape(-1, 3))
    cloud.colors = o3d.utility.Vector3dVector(color_image.reshape(-1, 3) / 255.0)

    # Masking for outdoor skies
    mask = point_image[:,:,2] < 1e3
    cloud.points = o3d.utility.Vector3dVector(point_image[mask].reshape(-1, 3))
    cloud.colors = o3d.utility.Vector3dVector(color_image[mask].reshape(-1, 3) / 255.0)

    return cloud

In [17]:
# Save point-cloud files
output_path = data_root / "point_clouds"
os.makedirs(output_path, exist_ok=True)

for i in range(num_samples):
    cloud = create_point_cloud(depth_image=indoor_samples[i][1], color_image=indoor_samples[i][0])
    o3d.io.write_point_cloud(output_path / f"indoor_point_cloud_{i}.ply", cloud)

    cloud = create_point_cloud(depth_image=outdoor_samples[i][1], color_image=outdoor_samples[i][0])
    o3d.io.write_point_cloud(output_path / f"outdoor_point_cloud_{i}.ply", cloud)

In [21]:
# Visualize your point-cloud in an interactive window
version, number = "indoor", 2
cloud_path = output_path / f"{version}_point_cloud_{number}.ply"
pcd = o3d.io.read_point_cloud(cloud_path)
o3d.visualization.draw_geometries([pcd])


In [43]:
# Visualize your point-cloud in an interactive window (fov slider)
import tkinter as tk
from tkinter import ttk

default_fov = 55
default_scale = 1.0
default_offset = 0.0
class FOVVisualizerApp:
    def __init__(self, master, depth_image, color_image):
        self.master = master
        self.depth_image = depth_image
        self.color_image = color_image

        self.master.title("FOV Camera Intrinsics Visualizer")

        # FOV slider and label
        label = ttk.Label(master, text="Field of View (degrees):")
        label.pack(pady=5)

        self.fov_slider = ttk.Scale(master, from_=10, to=120, orient='horizontal',
                                    command=self.update_fov_or_scale)
        self.fov_slider.set(default_fov)
        self.fov_slider.pack(fill='x', padx=10, pady=5)

        self.fov_value_label = ttk.Label(master, text=f"FOV: {default_fov}°")
        self.fov_value_label.pack(pady=(0, 10))

        # Depth scale slider and label
        label_scale = ttk.Label(master, text="Depth Scale (larger = bigger):")
        label_scale.pack(pady=5)

        self.scale_slider = ttk.Scale(master, from_=1, to=10, orient='horizontal',
                                     command=self.update_fov_or_scale)
        self.scale_slider.set(default_scale)
        self.scale_slider.pack(fill='x', padx=10, pady=5)

        self.scale_value_label = ttk.Label(master, text=f"Depth Scale: {default_scale}")
        self.scale_value_label.pack(pady=(0, 10))

        # Offset slider and label
        label_offset = ttk.Label(master, text="Offset (larger = closer):")
        label_offset.pack(pady=5)

        self.offset_slider = ttk.Scale(master, from_=-5, to=20, orient='horizontal',
                                     command=self.update_fov_or_scale)
        self.offset_slider.set(default_offset)
        self.offset_slider.pack(fill='x', padx=10, pady=5)

        self.offset_value_label = ttk.Label(master, text=f"Offset: {default_offset}")
        self.offset_value_label.pack(pady=(0, 10))

        # Open3D visualizer init
        self.vis = o3d.visualization.Visualizer()
        self.vis.create_window(window_name="Point Cloud", width=800, height=600, visible=True)
        self.pcd = None

        # Initial display
        self.update_point_cloud(default_fov, default_scale, default_offset)

        # Start periodic visualizer update
        self.update_visualizer()

        self.master.protocol("WM_DELETE_WINDOW", self.on_closing)

    def update_point_cloud(self, fov, scale_ratio, offset):
        fov = float(fov)
        H, W = self.depth_image.shape
        intrinsics = get_intrinsics(H, W, fov)

        temp_depth = self.depth_image.copy() ** (1.0 / scale_ratio) + offset
        cloud = create_point_cloud(temp_depth, self.color_image, intrinsics)

        if self.pcd is None:
            self.pcd = cloud
            self.vis.add_geometry(self.pcd)
        else:
            self.pcd.points = cloud.points
            self.pcd.colors = cloud.colors
            self.vis.update_geometry(self.pcd)

    def update_fov_or_scale(self, val):
        # Get current slider values
        current_fov = self.fov_slider.get()
        current_scale = self.scale_slider.get()
        current_offset = self.offset_slider.get()

        # Update labels
        self.fov_value_label.config(text=f"FOV: {current_fov:.1f}°")
        self.scale_value_label.config(text=f"Depth Scale: {int(current_scale)}")
        self.offset_value_label.config(text=f"Offset: {int(current_offset)}")

        # Update point cloud with current parameters
        self.update_point_cloud(current_fov, current_scale, current_offset)

    def update_visualizer(self):
        self.vis.poll_events()
        self.vis.update_renderer()
        self.master.after(30, self.update_visualizer)

    def on_closing(self):
        self.vis.destroy_window()
        self.master.destroy()


def run_visualizer():
    dataset, index = outdoor_samples, 2

    depth_image = dataset[index][1]
    color_image = dataset[index][0]

    root = tk.Tk()
    app = FOVVisualizerApp(root, depth_image, color_image)
    root.mainloop()

run_visualizer()

Exception in Tkinter callback
Traceback (most recent call last):
  File "/Users/sammygadekar/miniconda3/lib/python3.13/tkinter/__init__.py", line 2068, in __call__
    return self.func(*args)
           ~~~~~~~~~^^^^^^^
  File "/var/folders/27/hd37c_md0nj25ymwhp4tsz_m0000gn/T/ipykernel_28217/700417097.py", line 84, in update_fov_or_scale
    current_scale = self.scale_slider.get()
                    ^^^^^^^^^^^^^^^^^
AttributeError: 'FOVVisualizerApp' object has no attribute 'scale_slider'
Exception in Tkinter callback
Traceback (most recent call last):
  File "/Users/sammygadekar/miniconda3/lib/python3.13/tkinter/__init__.py", line 2068, in __call__
    return self.func(*args)
           ~~~~~~~~~^^^^^^^
  File "/var/folders/27/hd37c_md0nj25ymwhp4tsz_m0000gn/T/ipykernel_28217/700417097.py", line 85, in update_fov_or_scale
    current_offset = self.offset_slider.get()
                     ^^^^^^^^^^^^^^^^^^
AttributeError: 'FOVVisualizerApp' object has no attribute 'offset_slider'
Exc