In [1]:
# Set project root
import os
import sys

# Make the parent of the current working directory importable so that the
# project-local `src` package can be resolved from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import json
import cv2
import glob
import pandas as pd
import numpy as np
from typing import Literal

from src.pipeline.geometry import derive_metric_homography
from src.data import Template, load_measurements_from_yaml
from src.pipeline.matching import extract_features, match_descriptors, compute_homography
from src.pipeline.geometry import recover_all_poses_from_homography, select_best_solution
from src.utils import load_rgb
from src.pipeline.calibration import CalibrationSimple

In [3]:
# Define paths to data (all anchored at project_root so they do not depend on
# a hard-coded relative path)
colmap_path = os.path.join(project_root, 'assets', 'colmap')
images_dir = os.path.join(colmap_path, 'images')
depth_maps_dir = os.path.join(colmap_path, 'depth_maps')
corners_csv_path = os.path.join(colmap_path, 'book_corners_labels.csv')
focal_lengths_path = os.path.join(colmap_path, 'focal_lengths.npy')
measurements_path = os.path.join(project_root, 'assets', 'measurements.yaml')

# Load focal lengths estimations by DepthPro
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects; only
# load this file if it comes from a trusted source.
# The result is later read with `.item()[image_name]`, i.e. it is expected to be
# a 0-d object array wrapping a mapping indexed by image file name.
focal_lengths = np.load(focal_lengths_path, allow_pickle=True)

# Load corner annotations
corners_df = pd.read_csv(corners_csv_path)

# Load template data
data = load_measurements_from_yaml(measurements_path)

# Ground-truth focal length in pixels
f_gt = 5152.22

In [None]:
def extract_corners(filename: str) -> np.ndarray:
    """
    Look up the annotated corners of an image in `corners_df`.

    Args:
        filename (str): Value to look for in the 'image_name' column.

    Returns:
        np.ndarray: float32 array of shape (4, 2) holding the (x, y) pixel
            coordinates ordered top-left, top-right, bottom-right, bottom-left.

    Raises:
        IndexError: If no row of `corners_df` matches `filename`.
    """
    # Columns in the order the corners are returned (x, y interleaved)
    corner_columns = [
        'top_left_x', 'top_left_y',
        'top_right_x', 'top_right_y',
        'bottom_right_x', 'bottom_right_y',
        'bottom_left_x', 'bottom_left_y',
    ]

    # First annotation row whose image name matches
    is_match = corners_df['image_name'] == filename
    row = corners_df.loc[is_match].iloc[0]

    # Pull the eight coordinates and fold them into four (x, y) pairs
    coordinates = [row[column] for column in corner_columns]
    return np.array(coordinates, dtype=np.float32).reshape(4, 2)

def compute_pose_from_corners(
    corners: np.ndarray,
    K: np.ndarray,
    w_m: float,
    h_m: float
) -> tuple[np.ndarray, np.ndarray]:
    """
    Estimate the pose of a planar rectangle from its four image corners via PnP.

    Args:
        corners (np.ndarray): 2D image points matching, in order, the model
            points (0, 0), (w_m, 0), (w_m, h_m), (0, h_m).
        K (np.ndarray): Intrinsic camera matrix (no distortion is passed to PnP).
        w_m (float): Rectangle extent along its local x axis.
        h_m (float): Rectangle extent along its local y axis.

    Returns:
        tuple: (rvec, tvec) as returned by cv2.solvePnP, or (None, None) when
            the solver reports failure (a message is printed in that case).
    """
    # Rectangle corners in the rectangle's own frame (z = 0 plane)
    model_points = np.array(
        [[0, 0, 0], [w_m, 0, 0], [w_m, h_m, 0], [0, h_m, 0]],
        dtype=np.float32,
    )

    # Pose of the rectangle frame expressed in the camera frame
    solved, rvec, tvec = cv2.solvePnP(model_points, corners, K, None)

    if solved:
        return rvec, tvec

    print("PnP solving failed!")
    return None, None

def compute_pose_ground_truth(pose: tuple[np.ndarray, np.ndarray]) -> tuple[float, np.ndarray]:
    """
    Reduce a (rvec, tvec) pose to a distance and a plane normal.

    Args:
        pose (tuple): Rotation vector and translation vector.

    Returns:
        tuple: The Euclidean norm of the translation and the third column of
            the rotation matrix obtained from the rotation vector.
    """
    rotation_vector, translation = pose
    rotation_matrix = cv2.Rodrigues(rotation_vector)[0]
    distance = np.linalg.norm(translation)
    plane_normal = rotation_matrix[:, 2]
    return distance, plane_normal

def template_match(
    scene: np.ndarray,
    templates: list[Template],
    extract_method: Literal['SIFT', 'ORB'] = 'SIFT',
    match_method: Literal['BF', 'FLANN'] = 'BF',
    min_match_count: int = 10,
) -> dict:
    """
    Perform multi-template matching on a scene image for the given templates.

    Templates are processed in order. After a template is located, the scene
    keypoints that supported its homography (the inliers) are removed so that
    later templates cannot match against the same scene features.

    Args:
        scene (np.ndarray): The scene image.
        templates (list[Template]): List of template objects to match against the scene.
        extract_method (Literal['SIFT', 'ORB']): Feature extraction method.
        match_method (Literal['BF', 'FLANN']): Feature matching method.
        min_match_count (int): Match-count threshold. A template is skipped unless
            strictly more than this many matches are found.

    Returns:
        dict: A dictionary indexed by template id. Templates that could not be
            matched are absent. For each matched template:
            - 'homography': The metric homography derived from the pixel homography.
            - 'error': The error value reported by `compute_homography`.
    """
    # Initialize the results dictionary
    results = {}

    # Extract the scene features and descriptors once
    scene_keypoints, scene_descriptors = extract_features(scene, method=extract_method, max_features=20000)

    # Iterate over each template
    for template in templates:
        # Load the template image
        template_image = load_rgb(os.path.join(project_root, template.path))

        # Extract features and descriptors from the template
        template_keypoints, template_descriptors = extract_features(template_image, method=extract_method)

        # Match the descriptors between the template (query) and the scene (train)
        matches = match_descriptors(template_descriptors, scene_descriptors, method=match_method)

        # Check if enough matches are found
        if len(matches) <= min_match_count:
            print(f"Not enough matches found for template {template.id}.")
            continue

        # Compute the homography
        H_px, mask, error = compute_homography(template_keypoints, scene_keypoints, matches)

        # Guard against a failed estimation instead of crashing further down
        if H_px is None or mask is None:
            print(f"Homography estimation failed for template {template.id}.")
            continue

        # Derive the metric homography
        H_metric = derive_metric_homography(
            H_px=H_px,
            template_size_px=template_image.shape[:2],
            template_size_metric=(template.height, template.width),
        )

        # Collect the scene keypoints used as inliers. The template descriptors are
        # passed as the query set and the scene descriptors as the train set, so
        # (assuming match_descriptors keeps the OpenCV matcher argument order) the
        # scene keypoint index is `trainIdx`; `queryIdx` indexes the template.
        inlier_flags = np.asarray(mask).ravel()
        inlier_indices = {
            match.trainIdx
            for match, is_inlier in zip(matches, inlier_flags)
            if is_inlier
        }

        # Keep only the scene keypoints/descriptors that were not used as inliers
        keep = np.ones(len(scene_keypoints), dtype=bool)
        keep[np.fromiter(inlier_indices, dtype=np.intp, count=len(inlier_indices))] = False
        scene_keypoints = [kp for kp, kept in zip(scene_keypoints, keep) if kept]
        scene_descriptors = scene_descriptors[keep]

        # Store the results
        results[template.id] = {
            'homography': H_metric,
            'error': error
        }

    return results

def calibrate_camera(
    homographies: list[np.ndarray],
    image_size: np.ndarray
) -> np.ndarray:
    """
    Estimate the camera intrinsics from a set of homographies.

    The principal point is fixed at the image center and handed to
    `CalibrationSimple`, which the original notes describe as estimating only
    the focal length under these assumptions:
    - zero skew
    - principal point at the image center
    - square pixels (fx = fy = f)

    Args:
        homographies (list[np.ndarray]): List of homographies.
        image_size (tuple[int, int]): Size of the image (height, width).

    Returns:
        np.ndarray: Estimated intrinsic camera matrix (3x3).
    """
    # Image center, expressed as (x, y) = (width / 2, height / 2)
    height, width = image_size[0], image_size[1]
    principal_point = (width / 2, height / 2)

    # Feed the homographies to the calibrator and solve
    calibrator = CalibrationSimple()
    calibrator.add_homographies(homographies)
    return calibrator.calibrate(principal_point=principal_point)

def refine_calibration(
    templates: list[Template],
    homographies: list[np.ndarray],
    image_size: np.ndarray,
    K_init: np.ndarray,
    resolution: int = 20,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Refine the camera intrinsics and estimate radial distortion parameters (k1, k2).

    For every template a regular grid of points is laid out on its plane (z = 0)
    and projected into the image with the template's homography. These synthetic
    correspondences are passed to cv2.calibrateCamera.

    Args:
        templates (list[Template]): List of template objects containing metric dimensions.
        homographies (list[np.ndarray]): List of homographies for the scene, one per template.
        image_size (np.ndarray): Size of the image (width, height).
        K_init (np.ndarray): Initial intrinsic matrix (3x3). It is not modified.
        resolution (int): Number of grid points per axis used for each template.

    Returns:
        tuple: Refined intrinsic matrix (3x3) and radial distortion parameters
            (k1, k2) as a 1D array.

    Raises:
        ValueError: If no template/homography pair is supplied.
        RuntimeError: If the calibration yields a non-finite reprojection error.
    """
    # Define the world points and the corresponding image points for each template
    object_points = []
    image_points = []
    for template, H in zip(templates, homographies):
        # Get template metric dimensions
        w, h = template.width, template.height

        # Define grid points on the template plane
        grid_x, grid_y = np.meshgrid(
            np.linspace(0, w, resolution),
            np.linspace(0, h, resolution),
        )

        # Create world points (x, y, 0)
        object_points_3d = np.column_stack(
            [grid_x.ravel(), grid_y.ravel(), np.zeros(grid_x.size)]
        ).astype(np.float32)

        # Create image points by mapping the planar coordinates through the homography
        image_points_2d = cv2.perspectiveTransform(
            object_points_3d[:, :2].reshape(-1, 1, 2), H
        ).reshape(-1, 2)

        # Add these points to the lists
        object_points.append(object_points_3d)
        image_points.append(image_points_2d)

    if not object_points:
        raise ValueError("At least one template/homography pair is required for refinement.")

    # Initialize the distortion coefficients to zero
    dist_coeffs_init = np.zeros(5, dtype=np.float32)

    # Define the flags for the optimization
    flags = (
        cv2.CALIB_USE_INTRINSIC_GUESS |
        cv2.CALIB_FIX_PRINCIPAL_POINT |
        cv2.CALIB_FIX_ASPECT_RATIO |
        cv2.CALIB_ZERO_TANGENT_DIST |
        cv2.CALIB_FIX_K3 |
        cv2.CALIB_FIX_K4 |
        cv2.CALIB_FIX_K5 |
        cv2.CALIB_FIX_K6
    )

    # Refine the intrinsic parameters and distortion coefficients.
    # cameraMatrix is an in/out argument, so a copy is passed to keep the
    # caller's K_init untouched; imageSize must be a plain (width, height) tuple.
    ret, K, dist_coeffs, rvecs, tvecs = cv2.calibrateCamera(
        objectPoints=object_points,
        imagePoints=image_points,
        imageSize=tuple(int(v) for v in image_size),
        cameraMatrix=np.array(K_init, dtype=np.float64),
        distCoeffs=dist_coeffs_init,
        flags=flags
    )

    # `ret` is the RMS reprojection error, not a success flag: a value of 0.0 is a
    # perfect fit, so only a non-finite value is treated as a failure.
    if not np.isfinite(ret):
        raise RuntimeError("Camera calibration failed. Check the input data and parameters.")

    # Flatten first: the coefficients may come back as a row or a column vector,
    # and slicing the unflattened array would not reliably select (k1, k2).
    return K, np.asarray(dist_coeffs).ravel()[:2]

def analyze_scene(
    templates: list[Template],
    homographies: list[np.ndarray],
    K: np.ndarray
) -> dict:
    """
    Recover, for every template, a distance and a plane normal from its homography.

    Args:
        templates (list[Template]): Template objects, paired positionally with `homographies`.
        homographies (list[np.ndarray]): Metric homographies for the scene.
        K (np.ndarray): Intrinsic camera matrix.

    Returns:
        dict: Indexed by template id. Templates for which no pose is selected
            are omitted. For each remaining template:
            - 'd': norm of the recovered translation divided by 1000
              (noted in the code as a conversion to meters)
            - 'n': third column of the recovered rotation matrix
    """
    scene_info = {}

    for template, H_metric in zip(templates, homographies):
        # Enumerate the candidate poses and keep the preferred one
        candidates = recover_all_poses_from_homography(H_metric, K)
        chosen = select_best_solution(candidates)

        if chosen is not None:
            rotation, translation, _ = chosen
            scene_info[template.id] = {
                'd': np.linalg.norm(translation) / 1000,  # Convert to meters
                'n': rotation[:, 2]
            }

    return scene_info

def pipeline_results(image: np.ndarray) -> tuple[float, float, np.ndarray]:
    """
    Run the homography-based pipeline on one image for template "T5".

    Steps: template matching, closed-form calibration, calibration refinement,
    and pose analysis.

    Args:
        image (np.ndarray): The scene image; its first two dimensions are
            taken as (height, width).

    Returns:
        tuple: (focal length taken from K[0, 0] of the refined intrinsics,
            estimated distance 'd', estimated plane normal 'n').

    Raises:
        RuntimeError: If the template cannot be matched in the image or no valid
            pose can be recovered for it.
    """
    # Load the template
    template_id = "T5"
    templates = [data.get_template(template_id)]

    # Perform template matching
    match_results = template_match(image, templates)

    # Keep templates and homographies aligned: only templates that were matched
    matched_templates = [template for template in templates if template.id in match_results]
    if not matched_templates:
        raise RuntimeError(f"Template {template_id} could not be matched in the scene.")
    homographies = [match_results[template.id]['homography'] for template in matched_templates]

    height, width = image.shape[:2]

    # Calibrate the camera using the homographies (expects (height, width))
    K_init = calibrate_camera(homographies=homographies, image_size=(height, width))

    # Refine the calibration using the templates and homographies.
    # refine_calibration documents image_size as (width, height), which is the
    # order it forwards to cv2.calibrateCamera.
    K_refined, _ = refine_calibration(
        templates=matched_templates,
        homographies=homographies,
        image_size=(width, height),
        K_init=K_init
    )

    # Analyze the scene
    scene_results = analyze_scene(
        templates=matched_templates,
        homographies=homographies,
        K=K_refined
    )
    if template_id not in scene_results:
        raise RuntimeError(f"No valid pose could be recovered for template {template_id}.")
    results = scene_results[template_id]

    return K_refined[0, 0], results['d'], results['n']

def get_plane_normal_from_corners(corners: np.ndarray, depth_map: np.ndarray, focal_length: float) -> np.ndarray:
    """
    Estimate the normal of the plane spanned by a set of image corners using a depth map.

    Each corner is back-projected to camera coordinates with a pinhole model whose
    principal point is the center of the depth map, treating the sampled depth as
    the z coordinate. A plane is then fitted to the 3D points with an SVD.

    The sign of an SVD normal is arbitrary, so the result is oriented to agree with
    cross(corner[1] - corner[0], corner[-1] - corner[0]). For corners ordered
    top-left, top-right, bottom-right, bottom-left this is the z axis of a frame
    whose x axis runs top-left -> top-right and y axis top-left -> bottom-left.

    Args:
        corners (np.ndarray): (N, 2) array of (x, y) pixel coordinates, N >= 3 for
            an oriented result.
        depth_map (np.ndarray): Depth map indexed as [row (y), column (x)].
        focal_length (float): Focal length in pixels.

    Returns:
        np.ndarray: Unit normal vector (3,) in camera coordinates.
    """
    # Camera parameters: principal point at the center of the depth map
    h, w = depth_map.shape[:2]
    cx = w / 2
    cy = h / 2

    # Convert 2D corners to 3D camera coordinates using the sampled depth
    points = []
    for x, y in corners:
        depth = depth_map[int(y), int(x)]
        points.append([
            (x - cx) * depth / focal_length,
            (y - cy) * depth / focal_length,
            depth,
        ])
    corners_3d = np.array(points)

    # Fit plane using SVD: the right singular vector of the smallest singular
    # value of the centered points is the plane normal (up to sign)
    centered_points = corners_3d - np.mean(corners_3d, axis=0)
    _, _, Vt = np.linalg.svd(centered_points)
    normal = Vt[-1]

    # Resolve the sign ambiguity deterministically from the corner winding order
    if len(corners_3d) >= 3:
        reference = np.cross(
            corners_3d[1] - corners_3d[0],
            corners_3d[-1] - corners_3d[0],
        )
        if np.dot(normal, reference) < 0:
            normal = -normal

    return normal

def depth_pro_results(corners: np.ndarray, depth_map: np.ndarray, focal_length: float) -> tuple[float, np.ndarray]:
    """
    Derive a distance and a plane normal from a depth map.

    The depth sampled at the first corner is treated as the z coordinate of that
    point (the same convention used when back-projecting corners for the plane
    fit) and converted to the Euclidean camera-to-point distance, so that it is
    comparable with distances computed as the norm of a translation vector.

    Args:
        corners (np.ndarray): (N, 2) array of (x, y) pixel coordinates; the first
            entry is the point whose distance is reported.
        depth_map (np.ndarray): Depth map indexed as [row (y), column (x)].
        focal_length (float): Focal length in pixels.

    Returns:
        tuple: (distance from the camera to the first corner, plane normal).
    """
    x, y = corners[0]
    depth = depth_map[int(y), int(x)]

    # Viewing ray through the pixel for a pinhole camera with the principal point
    # at the center of the depth map, scaled so that its z component is 1
    h, w = depth_map.shape[:2]
    ray = np.array([(x - w / 2) / focal_length, (y - h / 2) / focal_length, 1.0])
    distance = depth * np.linalg.norm(ray)

    normal = get_plane_normal_from_corners(corners, depth_map, focal_length)
    return distance, normal

In [5]:
def compare():
    """
    Compare focal length, distance and plane normal across the three methods.

    For every '*.jpg' in `images_dir` (processed in sorted order) the
    ground-truth pose (PnP on the annotated corners with `f_gt`), the pipeline
    estimate and the DepthPro estimate are computed.

    Returns:
        tuple: Three dictionaries (focal length, distance, normal), each indexed
            by image file name and holding the keys 'pipeline', 'depth-pro'
            and 'gt'.
    """
    def _entry(pipeline_value, depth_pro_value, gt_value):
        # One comparison record, keys in a fixed order
        return {
            'pipeline': pipeline_value,
            'depth-pro': depth_pro_value,
            'gt': gt_value,
        }

    # Instantiate results
    f_comparison, distance_comparison, normal_comparison = {}, {}, {}

    # Loop through each image
    filenames = glob.glob(os.path.join(images_dir, '*.jpg'))
    for i, filename in enumerate(sorted(filenames)):
        image_name = os.path.basename(filename)
        print(f"[{i + 1}/{len(filenames)}] {image_name}")

        # Load the image and its depth map (stored transposed on disk)
        image = load_rgb(filename)
        depth_map = np.load(
            os.path.join(depth_maps_dir, image_name.replace('.jpg', '_map.npy'))
        ).T

        # Ground-truth intrinsics: known focal length, principal point at the center
        h, w = image.shape[:2]
        K_gt = np.eye(3)
        K_gt[0, 0] = K_gt[1, 1] = f_gt
        K_gt[0, 2], K_gt[1, 2] = w / 2, h / 2

        # Ground-truth distance and normal from the annotated corners
        corners = extract_corners(image_name)
        d_gt, n_gt = compute_pose_ground_truth(
            compute_pose_from_corners(corners, K_gt, w_m=0.173, h_m=0.26)
        )

        # Pipeline results
        f_pp, d_pp, n_pp = pipeline_results(image)

        # DepthPro results
        f_dp = focal_lengths.item()[image_name]
        d_dp, n_dp = depth_pro_results(corners, depth_map, f_dp)

        # Store results
        f_comparison[image_name] = _entry(f_pp, f_dp, f_gt)
        distance_comparison[image_name] = _entry(d_pp, d_dp, d_gt)
        normal_comparison[image_name] = _entry(n_pp, n_dp, n_gt)

    return f_comparison, distance_comparison, normal_comparison

In [6]:
# Run the comparison over every image; the three results are the per-image
# focal-length, distance and normal dictionaries returned by compare().
f_comp, d_comp, n_comp = compare()

[1/50] IMG_8222.jpg
[2/50] IMG_8223.jpg
[3/50] IMG_8224.jpg
[4/50] IMG_8225.jpg
[5/50] IMG_8226.jpg
[6/50] IMG_8227.jpg
[7/50] IMG_8228.jpg
[8/50] IMG_8229.jpg
[9/50] IMG_8230.jpg
[10/50] IMG_8231.jpg
[11/50] IMG_8232.jpg
[12/50] IMG_8233.jpg
[13/50] IMG_8234.jpg
[14/50] IMG_8235.jpg
[15/50] IMG_8236.jpg
[16/50] IMG_8237.jpg
[17/50] IMG_8238.jpg
[18/50] IMG_8239.jpg
[19/50] IMG_8240.jpg
[20/50] IMG_8241.jpg
[21/50] IMG_8242.jpg
[22/50] IMG_8243.jpg
[23/50] IMG_8244.jpg
[24/50] IMG_8245.jpg
[25/50] IMG_8246.jpg
[26/50] IMG_8247.jpg
[27/50] IMG_8248.jpg
[28/50] IMG_8249.jpg
[29/50] IMG_8250.jpg
[30/50] IMG_8251.jpg
[31/50] IMG_8252.jpg
[32/50] IMG_8253.jpg
[33/50] IMG_8254.jpg
[34/50] IMG_8255.jpg
[35/50] IMG_8256.jpg
[36/50] IMG_8257.jpg
[37/50] IMG_8258.jpg
[38/50] IMG_8259.jpg
[39/50] IMG_8260.jpg
[40/50] IMG_8261.jpg
[41/50] IMG_8262.jpg
[42/50] IMG_8263.jpg
[43/50] IMG_8264.jpg
[44/50] IMG_8265.jpg
[45/50] IMG_8266.jpg
[46/50] IMG_8268.jpg
[47/50] IMG_8269.jpg
[48/50] IMG_8270.jpg
[

In [7]:
class NumpyEncoder(json.JSONEncoder):
    """
    JSON encoder that understands NumPy arrays and NumPy scalar types.

    - np.ndarray  -> nested Python lists
    - np.bool_    -> bool
    - np.integer  -> int (integers stay integers instead of becoming floats)
    - np.floating -> float (any width, not just 32/64 bit)

    Anything else is delegated to json.JSONEncoder, which raises TypeError for
    unsupported objects.
    """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

# Bundle the focal-length, distance and normal comparisons and write them to a
# single JSON file next to the COLMAP assets
output_path = os.path.join(colmap_path, 'comparisons_v2.json')
data_to_save = {'f_comp': f_comp, 'd_comp': d_comp, 'n_comp': n_comp}

with open(output_path, 'w') as f:
    # NumpyEncoder turns the NumPy values stored in the dictionaries into JSON types
    json.dump(data_to_save, f, cls=NumpyEncoder, indent=2)