In [4]:
import os.path
import random

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [32]:
look_at = torch.tensor([0, 0, 0], dtype=torch.float32)
camera_position = torch.tensor([2, 2, 2], dtype=torch.float32)
z_axis = camera_position - look_at
z_axis = z_axis / torch.norm(z_axis)
z_axis

tensor([0.5774, 0.5774, 0.5774])

In [35]:
up_world = torch.tensor([0, 0, 1], dtype=torch.float32)
x_axis = torch.cross(up_world, z_axis)
x_axis = x_axis / torch.norm(x_axis)
x_axis

tensor([-0.7071,  0.7071,  0.0000])

In [36]:
y_axis = torch.cross(z_axis, x_axis)
y_axis = y_axis / torch.norm(y_axis)
y_axis

tensor([-0.4082, -0.4082,  0.8165])

In [37]:
extrinsic_matrix = torch.stack([x_axis, y_axis, z_axis, camera_position], dim=-1)
extrinsic_matrix

tensor([[-0.7071, -0.4082,  0.5774,  2.0000],
        [ 0.7071, -0.4082,  0.5774,  2.0000],
        [ 0.0000,  0.8165,  0.5774,  2.0000]])

In [38]:
world_point = torch.tensor([1, 1, 1, 1], dtype=torch.float32)  # Homogeneous coordinates

In [44]:
rotation_matrix = extrinsic_matrix[:3, :3]
translation_vector = extrinsic_matrix[:3, 3]
inverse_rotation = rotation_matrix.T  # Transpose of the rotation matrix
inverse_translation = -inverse_rotation @ translation_vector

inverse_extrinsic = torch.eye(4)
inverse_extrinsic[:3, :3] = inverse_rotation
inverse_extrinsic[:3, 3] = inverse_translation
inverse_extrinsic

tensor([[-0.7071,  0.7071,  0.0000,  0.0000],
        [-0.4082, -0.4082,  0.8165,  0.0000],
        [ 0.5774,  0.5774,  0.5774, -3.4641],
        [ 0.0000,  0.0000,  0.0000,  1.0000]])

In [41]:
camera_point = inverse_extrinsic @ world_point
camera_point

tensor([ 0.0000,  0.0000, -1.7321,  1.0000])

# 4x4 RT matrix

In [43]:
RT_matrix_4x4 = np.eye(4)
RT_matrix_4x4[:3, :3] = rotation_matrix
RT_matrix_4x4[:3, 3] = translation_vector

torch.tensor(RT_matrix_4x4)

tensor([[-0.7071, -0.4082,  0.5774,  2.0000],
        [ 0.7071, -0.4082,  0.5774,  2.0000],
        [ 0.0000,  0.8165,  0.5774,  2.0000],
        [ 0.0000,  0.0000,  0.0000,  1.0000]], dtype=torch.float64)

In [47]:
torch.tensor(RT_matrix_4x4) @ camera_point.to(dtype=torch.double)

tensor([1.0000, 1.0000, 1.0000, 1.0000], dtype=torch.float64)

In [57]:
fx = fy = 384
cx = cy = 256
w = h = 512
intrinsics = torch.tensor([
    [fx, fy],
    [cx, cy],
    [w, h],
], dtype=torch.float32)

intrinsics

tensor([[384., 384.],
        [256., 256.],
        [512., 512.]])

In [53]:
def get_normalized_camera_intrinsics(intrinsics: torch.Tensor):
    """
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    Return batched fx, fy, cx, cy
    """
    fx, fy = intrinsics[:, 0, 0], intrinsics[:, 0, 1]
    cx, cy = intrinsics[:, 1, 0], intrinsics[:, 1, 1]
    width, height = intrinsics[:, 2, 0], intrinsics[:, 2, 1]
    fx, fy = fx / width, fy / height
    cx, cy = cx / width, cy / height
    return fx, fy, cx, cy


def build_camera_principle(RT: torch.Tensor, intrinsics: torch.Tensor):
    """
    RT: (N, 3, 4)
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    """
    fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
    print(fx, fy, cx, cy)
    return torch.cat([
        RT.reshape(-1, 12),
        fx.unsqueeze(-1), fy.unsqueeze(-1), cx.unsqueeze(-1), cy.unsqueeze(-1),
    ], dim=-1)

In [55]:
dist_to_center = 2
canonical_camera_extrinsics = torch.tensor([[
    [1, 0, 0, 0],
    [0, 0, -1, -dist_to_center],
    [0, 1, 0, 0],
]], dtype=torch.float32)
canonical_camera_intrinsics = intrinsics.unsqueeze(0)
source_camera = build_camera_principle(canonical_camera_extrinsics, canonical_camera_intrinsics)
source_camera.shape

tensor([0.7500]) tensor([0.7500]) tensor([0.5000]) tensor([0.5000])


torch.Size([1, 16])

In [66]:
import math

def center_looking_at_camera_pose(camera_position: torch.Tensor, look_at: torch.Tensor = None, up_world: torch.Tensor = None):
    """
    camera_position: (M, 3)
    look_at: (3)
    up_world: (3)
    return: (M, 3, 4)
    """
    # by default, looking at the origin and world up is pos-z
    if look_at is None:
        look_at = torch.tensor([0, 0, 0], dtype=torch.float32)  # object is  at origin
    if up_world is None:
        up_world = torch.tensor([0, 0, 1], dtype=torch.float32)  # z-axis is upwards
    look_at = look_at.unsqueeze(0).repeat(camera_position.shape[0], 1)
    up_world = up_world.unsqueeze(0).repeat(camera_position.shape[0], 1)

    z_axis = camera_position - look_at
    z_axis = z_axis / z_axis.norm(dim=-1, keepdim=True)
    x_axis = torch.cross(up_world, z_axis)
    x_axis = x_axis / x_axis.norm(dim=-1, keepdim=True)
    y_axis = torch.cross(z_axis, x_axis)
    y_axis = y_axis / y_axis.norm(dim=-1, keepdim=True)
    extrinsics = torch.stack([x_axis, y_axis, z_axis, camera_position], dim=-1)
    return extrinsics

def _get_surrounding_views(M: int = 2, radius: float = 2.0, height: float = 0.8):
    # M: number of surrounding views
    # radius: camera dist to center
    # height: height of the camera
    # return: (M, 3, 4)
    assert M > 0
    assert radius > 0

    camera_positions = []
    projected_radius = math.sqrt(radius ** 2 - height ** 2)
    for i in range(M):
        theta = 2 * math.pi * i / M - math.pi / 2  # starting point is top (y-axis)
        x = projected_radius * math.cos(theta)
        y = projected_radius * math.sin(theta)
        z = height
        camera_positions.append([x, y, z])
    camera_positions = torch.tensor(camera_positions, dtype=torch.float32)
    extrinsics = center_looking_at_camera_pose(camera_positions)
    return extrinsics

In [67]:
render_camera_extrinsics = _get_surrounding_views()
render_camera_intrinsics = intrinsics.unsqueeze(0).repeat(render_camera_extrinsics.shape[0], 1, 1)
render_camera_extrinsics.shape

torch.Size([2, 3, 4])

In [70]:
render_camera_extrinsics[0], render_camera_intrinsics[0]

(tensor([[ 1.0000e+00, -2.4493e-17,  5.6120e-17,  1.1224e-16],
         [ 6.1232e-17,  4.0000e-01, -9.1652e-01, -1.8330e+00],
         [-0.0000e+00,  9.1652e-01,  4.0000e-01,  8.0000e-01]]),
 tensor([[384., 384.],
         [256., 256.],
         [512., 512.]]))

In [73]:
def compose_extrinsic_RT(RT: torch.Tensor):
    """
    Compose the standard form extrinsic matrix from RT.
    Batched I/O.
    """
    return torch.cat([
        RT,
        torch.tensor([[[0, 0, 0, 1]]], dtype=torch.float32).repeat(RT.shape[0], 1, 1)
        ], dim=1)


E = compose_extrinsic_RT(render_camera_extrinsics)
print(E)
fx, fy, cx, cy = get_normalized_camera_intrinsics(render_camera_intrinsics)
I = torch.stack([
    torch.stack([fx, torch.zeros_like(fx), cx], dim=-1),
    torch.stack([torch.zeros_like(fy), fy, cy], dim=-1),
    torch.tensor([[0, 0, 1]], dtype=torch.float32, device=render_camera_extrinsics.device).repeat(render_camera_extrinsics.shape[0], 1),
], dim=1)
torch.cat([
    E.reshape(-1, 16),
    I.reshape(-1, 9),
], dim=-1).shape

tensor([[[ 1.0000e+00, -2.4493e-17,  5.6120e-17,  1.1224e-16],
         [ 6.1232e-17,  4.0000e-01, -9.1652e-01, -1.8330e+00],
         [-0.0000e+00,  9.1652e-01,  4.0000e-01,  8.0000e-01],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]],

        [[-1.0000e+00, -2.4493e-17,  5.6120e-17,  1.1224e-16],
         [ 6.1232e-17, -4.0000e-01,  9.1652e-01,  1.8330e+00],
         [ 0.0000e+00,  9.1652e-01,  4.0000e-01,  8.0000e-01],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]])


torch.Size([2, 25])

# Dataset Rendering

In [94]:
cam = np.load("temp_data/render/000.npy")
cam.shape

(3, 4)

In [99]:
cam

array([[-8.42718780e-01,  5.38353980e-01,  3.31587557e-09,
        -8.96162987e-02],
       [ 5.95299751e-02,  9.31859463e-02,  9.93867457e-01,
         7.56601989e-03],
       [ 5.35052538e-01,  8.37550759e-01, -1.10577732e-01,
        -2.07504749e+00]])

In [100]:
np.vstack([cam, np.array([0, 0, 0, 1], dtype=np.float64)])

array([[-8.42718780e-01,  5.38353980e-01,  3.31587557e-09,
        -8.96162987e-02],
       [ 5.95299751e-02,  9.31859463e-02,  9.93867457e-01,
         7.56601989e-03],
       [ 5.35052538e-01,  8.37550759e-01, -1.10577732e-01,
        -2.07504749e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00]])

# Training

In [13]:
import torch
a = torch.rand(3,3)*275
a.max(), a.min()

(tensor(267.0834), tensor(98.0923))

In [15]:
import cv2 as cv
cv.imread("temp_data/render/impeller/000.png").shape

(1024, 1024, 3)