From f7e1a33642a6750324e6e74bf874a98f44c4a2b3 Mon Sep 17 00:00:00 2001
From: duinodu <472365351duino@gmail.com>
Date: Thu, 6 Nov 2025 13:17:13 +0800
Subject: [PATCH 1/2] update rerun sample

---
 bindings/spatialmp4.cpp            | 19 +++++++------------
 examples/python/visualize_rerun.py | 13 +++++++++++++
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/bindings/spatialmp4.cpp b/bindings/spatialmp4.cpp
index 14de04f..8bacebb 100644
--- a/bindings/spatialmp4.cpp
+++ b/bindings/spatialmp4.cpp
@@ -85,6 +85,7 @@ PYBIND11_MODULE(spatialmp4, m) {
   m.def("get_major_version", &SpatialML::GetMajorVersion, "Get the major version number");
   m.def("get_minor_version", &SpatialML::GetMinorVersion, "Get the minor version number");
   m.def("get_patch_version", &SpatialML::GetPatchVersion, "Get the patch version number");
+  m.attr("HEAD_MODEL_OFFSET") = Eigen::Vector3d(-0.05057, -0.01874, 0.04309);
 
   // Bind StreamType enum
   py::enum_<SpatialML::StreamType>(m, "StreamType")
@@ -108,9 +109,7 @@ PYBIND11_MODULE(spatialmp4, m) {
       .def_readwrite("qx", &SpatialML::pose_frame::qx)
       .def_readwrite("qy", &SpatialML::pose_frame::qy)
       .def_readwrite("qz", &SpatialML::pose_frame::qz)
-      .def("as_se3", [](const SpatialML::pose_frame &pose) {
-        return pose.as_se3().matrix();
-      })
+      .def("as_se3", [](const SpatialML::pose_frame &pose) { return pose.as_se3().matrix(); })
       .def("__repr__", [](const SpatialML::pose_frame &p) {
         std::ostringstream ss;
         ss << p;
@@ -161,9 +160,8 @@ PYBIND11_MODULE(spatialmp4, m) {
       .def_readwrite("fy", &SpatialML::camera_intrinsics::fy)
       .def_readwrite("cx", &SpatialML::camera_intrinsics::cx)
       .def_readwrite("cy", &SpatialML::camera_intrinsics::cy)
-      .def("as_cvmat", [](const SpatialML::camera_intrinsics &intrinsics) {
-        return MatxToEigen(intrinsics.as_cvmat());
-      })
+      .def("as_cvmat",
+           [](const SpatialML::camera_intrinsics &intrinsics) { return MatxToEigen(intrinsics.as_cvmat()); })
       .def("__repr__", [](const SpatialML::camera_intrinsics &i) {
         std::ostringstream ss;
         ss << i;
@@ -179,12 +177,9 @@ PYBIND11_MODULE(spatialmp4, m) {
           [](SpatialML::camera_extrinsics &extrinsics, const Eigen::Matrix4d &matrix) {
             extrinsics.extrinsics = EigenToMatx(matrix);
           })
-      .def("as_cvmat", [](const SpatialML::camera_extrinsics &extrinsics) {
-        return MatxToEigen(extrinsics.as_cvmat());
-      })
-      .def("as_se3", [](const SpatialML::camera_extrinsics &extrinsics) {
-        return extrinsics.as_se3().matrix();
-      })
+      .def("as_cvmat",
+           [](const SpatialML::camera_extrinsics &extrinsics) { return MatxToEigen(extrinsics.as_cvmat()); })
+      .def("as_se3", [](const SpatialML::camera_extrinsics &extrinsics) { return extrinsics.as_se3().matrix(); })
       .def("__repr__", [](const SpatialML::camera_extrinsics &e) {
         std::ostringstream ss;
         ss << e;
diff --git a/examples/python/visualize_rerun.py b/examples/python/visualize_rerun.py
index fbae9a0..6a1fdfe 100644
--- a/examples/python/visualize_rerun.py
+++ b/examples/python/visualize_rerun.py
@@ -22,12 +22,15 @@ def pico_pose_to_open3d(extrinsic):
 def main(
     video_file: str,
     depth_only: bool = False,
+    rgb_only: bool = False,
 ):
     """Visualize spatialmp4 using rerun."""
     reader = sm.Reader(video_file)
 
     if depth_only:
         reader.set_read_mode(sm.ReadMode.DEPTH_ONLY)
+    elif rgb_only:
+        reader.set_read_mode(sm.ReadMode.RGB_ONLY)
     else:
         reader.set_read_mode(sm.ReadMode.DEPTH_FIRST)
 
@@ -91,6 +94,16 @@ def main(
             extrinsic = np.eye(4)
             extrinsic[:3, 3] = [TWH.x, TWH.y, TWH.z]
             extrinsic[:3, :3] = Rotation.from_quat((TWH.qx, TWH.qy, TWH.qz, TWH.qw)).as_matrix()
+        elif rgb_only:
+            frame_rgb = reader.load_rgb()
+            timestamp = frame_rgb.timestamp
+
+            __import__('ipdb').set_trace()
+            T_W_Hrgb = frame_rgb.pose.as_se3()
+            T_W_Irgb = sm.head_to_imu(T_W_Hrgb, sm.HEAD_MODEL_OFFSET)
+            T_W_Srgb = T_W_Irgb * T_I_Srgb
+            pass
+
         else:
             rgbd = reader.load_rgbd(True)
             timestamp = rgbd.timestamp

From 9abf1a5a293ae4528538e4d3badacc100ab4f657 Mon Sep 17 00:00:00 2001
From: "bingwen.ai" <bingwen.ai@bytedance.com>
Date: Thu, 6 Nov 2025 21:06:49 +0800
Subject: [PATCH 2/2] Fix sm.head_to_imu binding bug.

---
 bindings/spatialmp4.cpp            | 46 ++++++++++++---
 examples/python/visualize_rerun.py | 90 ++++++++++++++++++++++++------
 python/tests/test_utilities.py     | 17 ++++++
 3 files changed, 128 insertions(+), 25 deletions(-)

diff --git a/bindings/spatialmp4.cpp b/bindings/spatialmp4.cpp
index 8bacebb..b846223 100644
--- a/bindings/spatialmp4.cpp
+++ b/bindings/spatialmp4.cpp
@@ -307,7 +307,8 @@ PYBIND11_MODULE(spatialmp4, m) {
 
   m.def(
       "head_to_imu",
-      [](py::array_t<double> head_pose_array, py::array_t<double> head_model_offset_array) {
+      [](py::array_t<double, py::array::forcecast> head_pose_array,
+         py::array_t<double, py::array::forcecast> head_model_offset_array) {
         py::buffer_info pose_info = head_pose_array.request();
         if (pose_info.ndim != 2 || pose_info.shape[0] != 4 || pose_info.shape[1] != 4) {
           throw std::invalid_argument("head_pose must be a 4x4 matrix");
@@ -317,12 +318,36 @@ PYBIND11_MODULE(spatialmp4, m) {
           throw std::invalid_argument("head_model_offset must be a 3-element vector");
         }
 
-        Eigen::Map<const Eigen::Matrix<double, 4, 4, Eigen::RowMajor>> head_pose_map(
-            static_cast<double *>(pose_info.ptr));
-        Eigen::Map<const Eigen::Vector3d> head_model_offset_map(static_cast<double *>(offset_info.ptr));
+        Eigen::Matrix4d head_pose_matrix;
+        const auto row_stride = pose_info.strides[0] / static_cast<py::ssize_t>(sizeof(double));
+        const auto col_stride = pose_info.strides[1] / static_cast<py::ssize_t>(sizeof(double));
+        const double *pose_ptr = static_cast<const double *>(pose_info.ptr);
+        if (pose_info.strides[1] == static_cast<py::ssize_t>(sizeof(double))) {
+          Eigen::Map<const Eigen::Matrix<double, 4, 4, Eigen::RowMajor>> head_pose_map(pose_ptr);
+          head_pose_matrix = head_pose_map;
+        } else if (pose_info.strides[0] == static_cast<py::ssize_t>(sizeof(double))) {
+          Eigen::Map<const Eigen::Matrix<double, 4, 4, Eigen::ColMajor>> head_pose_map(pose_ptr);
+          head_pose_matrix = head_pose_map;
+        } else {
+          head_pose_matrix.setZero();
+          for (py::ssize_t i = 0; i < 4; ++i) {
+            for (py::ssize_t j = 0; j < 4; ++j) {
+              head_pose_matrix(i, j) = pose_ptr[i * row_stride + j * col_stride];
+            }
+          }
+        }
 
-        Eigen::Matrix4d head_pose_matrix = head_pose_map;
-        Eigen::Vector3d head_model_offset = head_model_offset_map;
+        Eigen::Vector3d head_model_offset;
+        const double *offset_ptr = static_cast<const double *>(offset_info.ptr);
+        if (offset_info.strides[0] == static_cast<py::ssize_t>(sizeof(double))) {
+          Eigen::Map<const Eigen::Vector3d> head_model_offset_map(offset_ptr);
+          head_model_offset = head_model_offset_map;
+        } else {
+          const auto offset_stride = offset_info.strides[0] / static_cast<py::ssize_t>(sizeof(double));
+          for (py::ssize_t i = 0; i < 3; ++i) {
+            head_model_offset[i] = offset_ptr[i * offset_stride];
+          }
+        }
 
         Eigen::Matrix3d R = head_pose_matrix.block<3, 3>(0, 0);
         Eigen::Vector3d t = head_pose_matrix.block<3, 1>(0, 3);
@@ -333,9 +358,14 @@ PYBIND11_MODULE(spatialmp4, m) {
         Sophus::SE3d imu_pose;
         Utilities::HeadToImu(head_pose, head_model_offset, imu_pose);
 
-        Eigen::Matrix<double, 4, 4, Eigen::RowMajor> result = imu_pose.matrix();
+        const Eigen::Matrix4d result = imu_pose.matrix();
         py::array_t<double> output({4, 4});
-        std::memcpy(output.mutable_data(), result.data(), sizeof(double) * 16);
+        auto mutable_result = output.mutable_unchecked<2>();
+        for (py::ssize_t i = 0; i < 4; ++i) {
+          for (py::ssize_t j = 0; j < 4; ++j) {
+            mutable_result(i, j) = result(i, j);
+          }
+        }
         return output;
       },
       py::arg("head_pose"), py::arg("head_model_offset"));
diff --git a/examples/python/visualize_rerun.py b/examples/python/visualize_rerun.py
index 6a1fdfe..323d0e8 100644
--- a/examples/python/visualize_rerun.py
+++ b/examples/python/visualize_rerun.py
@@ -2,6 +2,7 @@
 import os
 import cv2
 import numpy as np
+from typing import Optional
 from scipy.spatial.transform import Rotation
 import spatialmp4 as sm
 import rerun as rr
@@ -9,6 +10,7 @@
 
 
 def pico_pose_to_open3d(extrinsic):
+    extrinsic = np.array(extrinsic, dtype=np.float64, copy=True)
     convert = np.array([
         [0, 0, 1, 0],
         [1, 0, 0, 0],
@@ -19,12 +21,51 @@ def pico_pose_to_open3d(extrinsic):
     return output
 
 
+def ensure_right_handed_rotation(rotation_matrix: np.ndarray) -> np.ndarray:
+    r = np.array(rotation_matrix, dtype=np.float64, copy=True)
+    u, _, vh = np.linalg.svd(r)
+    corrected = u @ vh
+    if np.linalg.det(corrected) < 0:
+        u[:, -1] *= -1
+        corrected = u @ vh
+    return corrected
+
+
+def head_to_imu(head_pose: np.ndarray, head_model_offset: np.ndarray) -> np.ndarray:
+    """Convert head pose (T_W_H) into IMU pose (T_W_I) matching Utilities::HeadToImu."""
+    head_pose = np.asarray(head_pose, dtype=np.float64)
+    if head_pose.shape != (4, 4):
+        raise ValueError("head_pose must be a 4x4 matrix")
+
+    head_model_offset = np.asarray(head_model_offset, dtype=np.float64)
+    if head_model_offset.shape != (3,):
+        raise ValueError("head_model_offset must be a 3-element vector")
+
+    rotation_matrix = head_pose[:3, :3]
+    translation = head_pose[:3, 3].copy()
+
+    rotation = Rotation.from_matrix(rotation_matrix)
+    translation -= rotation.apply(head_model_offset)
+
+    quat_x, quat_y, quat_z, quat_w = rotation.as_quat()  # scipy returns xyzw
+    imu_quat = np.array([quat_y, -quat_x, quat_z, quat_w], dtype=np.float64)  # xyzw
+    imu_rotation = Rotation.from_quat(imu_quat).as_matrix()
+    imu_translation = np.array([translation[1], -translation[0], translation[2]], dtype=np.float64)
+
+    imu_pose = np.eye(4, dtype=np.float64)
+    imu_pose[:3, :3] = imu_rotation
+    imu_pose[:3, 3] = imu_translation
+    return imu_pose
+
+
 def main(
     video_file: str,
     depth_only: bool = False,
     rgb_only: bool = False,
+    topk: Optional[int] = typer.Option(None, help="Limit visualization to the first K frames."),
 ):
     """Visualize spatialmp4 using rerun."""
+
     reader = sm.Reader(video_file)
 
     if depth_only:
@@ -34,14 +75,14 @@ def main(
     else:
         reader.set_read_mode(sm.ReadMode.DEPTH_FIRST)
 
-    if not reader.has_depth():
+    if not rgb_only and not reader.has_depth():
         typer.echo(typer.style(f"No depth found in input file", fg=typer.colors.RED))
         return
     if not reader.has_pose():
-        typer.echo(typer.style(f"No depth found in input file", fg=typer.colors.RED))
+        typer.echo(typer.style(f"No pose found in input file", fg=typer.colors.RED))
         return
 
-    blueprint=rrb.Horizontal(
+    blueprint = rrb.Horizontal(
         rrb.Vertical(
             rrb.Spatial3DView(name="3D", origin="world"),
             rrb.TextDocumentView(name="Description", origin="/description"),
@@ -83,6 +124,8 @@ def main(
         cx = float(reader.get_rgb_intrinsics_left().cx)
         cy = float(reader.get_rgb_intrinsics_left().cy)
 
+    processed = 0
+
     while reader.has_next():
         if depth_only:
             depth_frame = reader.load_depth()
@@ -98,28 +141,31 @@ def main(
             frame_rgb = reader.load_rgb()
             timestamp = frame_rgb.timestamp
 
-            __import__('ipdb').set_trace()
+            T_I_Srgb = reader.get_rgb_extrinsics_left().as_se3()
             T_W_Hrgb = frame_rgb.pose.as_se3()
             T_W_Irgb = sm.head_to_imu(T_W_Hrgb, sm.HEAD_MODEL_OFFSET)
-            T_W_Srgb = T_W_Irgb * T_I_Srgb
-            pass
+            T_W_Srgb = T_W_Irgb @ T_I_Srgb
+            extrinsic = T_W_Srgb
 
         else:
             rgbd = reader.load_rgbd(True)
             timestamp = rgbd.timestamp
             depth_np = rgbd.depth
             extrinsic = rgbd.T_W_S
-        print(f"Loading frame {reader.get_index() + 1} / {reader.get_frame_count()}, timestamp: {timestamp}")
 
-        # preprocess on depthmap
-        depth_uint16 = (depth_np * 1000).astype(np.uint16)
-        sobelx = cv2.Sobel(depth_uint16, cv2.CV_32F, 1, 0, ksize=3)
-        sobely = cv2.Sobel(depth_uint16, cv2.CV_32F, 0, 1, ksize=3)
-        grad_mag = np.sqrt(sobelx**2 + sobely**2)
-        depth_np[grad_mag > 500] = 0
-        depth_np[(depth_np < 0.2) | (depth_np > 5)] = 0
+        print(f"Loading frame {reader.get_index() + 1} / {reader.get_frame_count()}, timestamp: {timestamp}")
+        
+        if not rgb_only:
+            # preprocess on depthmap
+            depth_uint16 = (depth_np * 1000).astype(np.uint16)
+            sobelx = cv2.Sobel(depth_uint16, cv2.CV_32F, 1, 0, ksize=3)
+            sobely = cv2.Sobel(depth_uint16, cv2.CV_32F, 0, 1, ksize=3)
+            grad_mag = np.sqrt(sobelx**2 + sobely**2)
+            depth_np[grad_mag > 500] = 0
+            depth_np[(depth_np < 0.2) | (depth_np > 5)] = 0
 
         extrinsic = pico_pose_to_open3d(extrinsic)
+        extrinsic[:3, :3] = ensure_right_handed_rotation(extrinsic[:3, :3])
 
         rr.set_time_seconds("time", timestamp)
         rr.log("world/xyz", rr.Arrows3D(vectors=[[1, 0, 0], [0, 1, 0], [0, 0, 1]], colors=[[255, 0, 0], [0, 255, 0], [0, 0, 255]]))
@@ -132,9 +178,19 @@ def main(
         position = extrinsic[:3, 3]
         rotation = Rotation.from_matrix(extrinsic[:3, :3]).as_quat()
         rr.log("world/camera", rr.Transform3D(translation=position, rotation=rr.Quaternion(xyzw=rotation)))
-        rr.log("world/camera/image/depth", rr.DepthImage(depth_np, meter=1.0))
-        if not depth_only:
-            rr.log("world/camera/image/rgb", rr.Image(rgbd.rgb, color_model="BGR").compress(jpeg_quality=95))
+
+        if rgb_only:
+            rr.log("world/camera/image/rgb", rr.Image(frame_rgb.left_rgb, color_model="BGR").compress(jpeg_quality=95))
+        else:
+            rr.log("world/camera/image/depth", rr.DepthImage(depth_np, meter=1.0))
+            if not depth_only:
+                rr.log("world/camera/image/rgb", rr.Image(rgbd.rgb, color_model="BGR").compress(jpeg_quality=95))
+
+        processed += 1
+        if topk is not None and processed >= topk:
+            break
+        if topk and reader.get_index() > topk:
+            break
 
 
 if __name__ == "__main__":#
diff --git a/python/tests/test_utilities.py b/python/tests/test_utilities.py
index 9706e50..a965fa4 100644
--- a/python/tests/test_utilities.py
+++ b/python/tests/test_utilities.py
@@ -85,3 +85,20 @@ def test_head_to_imu_general_pose():
     imu_pose = spatialmp4.head_to_imu(head_pose, offset)
     expected = compute_expected_head_to_imu(head_pose, offset)
     assert np.allclose(imu_pose, expected, atol=1e-10)
+
+
+def test_head_to_imu_fortran_order_input():
+    angle = np.deg2rad(30.0)
+    Rx = np.array([
+        [1.0, 0.0, 0.0],
+        [0.0, np.cos(angle), -np.sin(angle)],
+        [0.0, np.sin(angle), np.cos(angle)],
+    ])
+    head_pose = np.eye(4, order="F")
+    head_pose[:3, :3] = Rx
+    head_pose[:3, 3] = np.array([0.3, -0.4, 0.5])
+    offset = np.array([0.12, -0.08, 0.04])
+
+    expected = compute_expected_head_to_imu(np.array(head_pose), offset)
+    imu_pose = spatialmp4.head_to_imu(head_pose, offset)
+    assert np.allclose(imu_pose, expected, atol=1e-12)