From 718de1dd5ff9e6c8880641bdb5fc0d727d61e465 Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 19:17:10 -0700
Subject: [PATCH 1/7] feat(datasets): mixed-frequency training + fps metadata
 for pi07

- DatasetMixtureConfig.action_freq is now Optional[float] and defaults
  to None, disabling resampling so each dataset is sampled at its
  native fps. resolve_delta_timestamps substitutes ds_meta.fps when
  action_freq is None so a chunk of N actions is N consecutive native
  frames regardless of the dataset's rate.
- New DatasetMixtureConfig.emit_fps (default True) makes __getitem__
  emit the dataset's native meta.fps as a torch.long sample key
  (always non-pad, no participation in metadata_drop_*_prob).
- All four pi07 / pi07_paligemma prepare_metadata implementations now
  tokenize fps with the lowercase "fps: N, " header between Robot: and
  Control:, omitted entirely when fps_is_pad is True.
- New EnvMetadataConfig.emit_fps (default True) makes add_eval_metadata
  broadcast cfg.env.fps as the inference-time fps metadata key,
  matching the training contract end-to-end.
---
 src/opentau/configs/default.py                |  37 ++++-
 src/opentau/datasets/factory.py               |   6 +
 src/opentau/datasets/lerobot_dataset.py       |  20 +++
 src/opentau/envs/configs.py                   |   7 +
 src/opentau/envs/utils.py                     |  18 ++-
 .../modeling_pi07_high_level.py               |  19 ++-
 .../pi07/low_level/modeling_pi07_low_level.py |  19 ++-
 .../modeling_pi07_high_level.py               |  14 +-
 .../low_level/modeling_pi07_low_level.py      |  62 ++++++--
 tests/configs/test_default.py                 |  35 ++++-
 tests/datasets/test_delta_timestamps.py       |  76 ++++++++++
 tests/datasets/test_optional_keys.py          |  84 ++++++++++-
 tests/envs/test_configs.py                    |  17 +++
 tests/envs/test_utils.py                      |  91 ++++++++++-
 tests/policies/test_pi07_cpu.py               | 142 ++++++++++++++++++
 .../test_pi07_paligemma_high_level_planner.py | 100 ++++++++++++
 .../policies/test_pi07_paligemma_low_level.py | 127 ++++++++++++++++
 17 files changed, 828 insertions(+), 46 deletions(-)

diff --git a/src/opentau/configs/default.py b/src/opentau/configs/default.py
index d835f8d9..19b723ae 100644
--- a/src/opentau/configs/default.py
+++ b/src/opentau/configs/default.py
@@ -174,7 +174,14 @@ class DatasetMixtureConfig:
             same length as `datasets` when provided. If None, weights are inferred
             from dataset lengths. Defaults to None.
         action_freq: Frequency at which actions from the dataset mixture are
-            resampled, in Hz. Defaults to 30.0.
+            resampled, in Hz. ``None`` (default) disables resampling — each
+            dataset is sampled at its native fps, so a single batch can mix
+            samples from sources running at different rates (predicting
+            ``chunk_size`` consecutive native frames per sample). Set a
+            positive float to resample every dataset in the mixture to that
+            common rate via nearest-neighbor frame selection. When using
+            ``None``, prefer also setting ``emit_fps=True`` so the policy
+            can condition on the per-sample rate.
         image_resample_strategy: Resample strategy for image features. Must be
             one of 'linear' or 'nearest'. Defaults to 'nearest'.
         vector_resample_strategy: Resample strategy for non-image features, such
@@ -227,6 +234,15 @@ class DatasetMixtureConfig:
             must have a non-empty ``control_mode`` after the optional
             ``DatasetConfig.control_mode`` override has been applied. Defaults
             to ``False`` (empty / missing values are allowed).
+        emit_fps: Whether ``__getitem__`` returns the dataset's native
+            ``meta.fps`` as the ``fps`` metadata key (``torch.long`` scalar,
+            paired with ``fps_is_pad=False``). Default ``True``. When
+            ``False``, neither ``fps`` nor ``fps_is_pad`` appears in the
+            sample dict and the pi07 / pi07_paligemma policy prefix omits
+            the ``fps:`` segment. Unlike the other metadata fields, ``fps``
+            is **not** rolled by ``metadata_drop_*_prob`` — it's an intrinsic
+            property of the dataset, not a noisy label, so it is always
+            present when ``emit_fps=True``.
         tolerance_s: Mixture-wide default tolerance (in seconds) for the
             load-time ``check_timestamps_sync`` call inside
             ``LeRobotDataset.__init__``. Each dataset's frame-to-frame
@@ -254,9 +270,9 @@ class DatasetMixtureConfig:
 
     Raises:
         ValueError: If `weights` is provided and its length doesn't match
-            `datasets`, if `action_freq` is not positive, if resample
-            strategies are invalid, or if any drop probability is outside
-            ``[0, 1]``.
+            `datasets`, if `action_freq` is not None and not positive, if
+            resample strategies are invalid, or if any drop probability is
+            outside ``[0, 1]``.
     """
 
     # List of dataset configs to be used in the mixture.
@@ -265,7 +281,8 @@ class DatasetMixtureConfig:
     # Must be the same length as `datasets` when provided.
     weights: list[float] | None = None
     # Frequency at which the actions from dataset mixture are resampled, in Hz.
-    action_freq: float = 30.0
+    # ``None`` disables resampling — each dataset is sampled at its native fps.
+    action_freq: float | None = None
     # Resample strategy for image features
     image_resample_strategy: str = "nearest"
     # Resample strategy for non-image features, such as action or state
@@ -297,6 +314,12 @@ class DatasetMixtureConfig:
     require_non_empty_robot_type: bool = False
     require_non_empty_control_mode: bool = False
 
+    # Whether `__getitem__` emits the dataset's native fps as the `fps`
+    # metadata key. Independent of `metadata_drop_*_prob` — fps is intrinsic
+    # to the dataset, not a noisy label, so it is always present (never
+    # padded) when this is True.
+    emit_fps: bool = True
+
     # Mixture-wide defaults for the load-time timestamp-sync check. Each
     # dataset can override these via `DatasetConfig.{tolerance_s,
     # skip_timestamp_check}`. The default tolerance matches
@@ -308,8 +331,8 @@ def __post_init__(self):
         """Validate dataset mixture configuration."""
         if self.weights is not None and len(self.datasets) != len(self.weights):
             raise ValueError("The length of `weights` must match the length of `datasets`.")
-        if self.action_freq <= 0:
-            raise ValueError(f"`action_freq` must be a positive number, got {self.action_freq}.")
+        if self.action_freq is not None and self.action_freq <= 0:
+            raise ValueError(f"`action_freq` must be a positive number or None, got {self.action_freq}.")
         if self.image_resample_strategy not in ["linear", "nearest"]:
             raise ValueError(
                 f"`image_resample_strategy` must be one of ['linear', 'nearest'], got {self.image_resample_strategy}."
diff --git a/src/opentau/datasets/factory.py b/src/opentau/datasets/factory.py
index 3cfbddf4..33d42583 100644
--- a/src/opentau/datasets/factory.py
+++ b/src/opentau/datasets/factory.py
@@ -131,6 +131,12 @@ def resolve_delta_timestamps(
     """
     delta_timestamps: dict[str, list[float]] = {}
     action_freq = cfg.dataset_mixture.action_freq
+    # Mixed-frequency training: `action_freq=None` opts out of resampling.
+    # Substituting `ds_meta.fps` makes every delta-timestamp land exactly on
+    # this dataset's native frame boundaries, so nearest-neighbor sampling is
+    # a no-op and consecutive frames are returned unchanged.
+    if action_freq is None:
+        action_freq = ds_meta.fps
 
     if dataset_cfg.repo_id is None:
         raise ValueError("dataset_cfg.repo_id must not be None when resolving delta timestamps.")
diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py
index 2abf6c14..9b3303ae 100644
--- a/src/opentau/datasets/lerobot_dataset.py
+++ b/src/opentau/datasets/lerobot_dataset.py
@@ -698,6 +698,10 @@ def __init__(self, cfg: TrainPipelineConfig):
         self.response_drop_prob = dm.response_drop_prob if dm else 0.0
         self.metadata_drop_all_prob = dm.metadata_drop_all_prob if dm else 0.0
         self.metadata_drop_each_prob = dm.metadata_drop_each_prob if dm else 0.0
+        # Whether `_emit_optional_keys` injects the dataset's native `meta.fps`
+        # under the `fps` / `fps_is_pad` keys. Independent of the dropout rolls
+        # above — fps is intrinsic to the dataset, not a noisy label.
+        self.emit_fps = dm.emit_fps if dm else True
         # Whether the above drop rolls actually fire. `make_dataset` flips this
         # off on the validation subset (unless `val_enable_optional_key_dropout`
         # is set). Subgoal *frame* selection (end-of-segment vs. uniform window)
@@ -875,6 +879,15 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None:
         pad signal — a consumer seeing ``""`` can assume the field was
         unavailable or was masked this step.
 
+        ``fps`` (the dataset's native frame rate) is emitted as a
+        ``torch.long`` scalar alongside ``fps_is_pad`` when
+        ``self.emit_fps`` is True. Unlike the other metadata fields, ``fps``
+        does **not** participate in the dropout rolls — it's an intrinsic
+        property of the dataset, not a noisy label — so it's always
+        non-padded when emitted. Set ``emit_fps=False`` on the mixture to
+        omit the keys entirely (the policy's ``prepare_metadata`` falls
+        through to its default-pad path).
+
         Dropout order:
             1. ``history_state_drop_prob``: zero ``state`` and historical camera
                frames; mark ``obs_history_is_pad`` all True.
@@ -885,6 +898,7 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None:
             4. ``metadata_drop_all_prob``: mask {speed, mistake, quality,
                robot_type, control_mode} together. If this didn't fire,
                ``metadata_drop_each_prob`` rolls independently for each field.
+               ``fps`` is **not** included in these rolls.
 
         Dropout rolls use the default torch RNG (auto-seeded per worker).
 
@@ -975,6 +989,12 @@ def _roll(prob: float) -> bool:
             drop_this = drop_meta_all or _roll(self.metadata_drop_each_prob)
             standard_item[key] = "" if drop_this else val
 
+        # (6) Native fps: intrinsic dataset property, not a noisy label. Emitted
+        # unconditionally (no dropout) when `emit_fps` is enabled on the mixture.
+        if self.emit_fps:
+            standard_item["fps"] = torch.tensor(int(self.meta.fps), dtype=torch.long)
+            standard_item["fps_is_pad"] = torch.tensor(False)
+
     def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor:
         """Resize an image to target dimensions with padding.
 
diff --git a/src/opentau/envs/configs.py b/src/opentau/envs/configs.py
index b47b246c..57bb09cd 100644
--- a/src/opentau/envs/configs.py
+++ b/src/opentau/envs/configs.py
@@ -76,6 +76,12 @@ class EnvMetadataConfig:
             ``None``.
         control_mode: ``"joint"`` (joint-position control) or ``"ee"``
             (end-effector control), or ``None``.
+        emit_fps: Whether to broadcast :attr:`EnvConfig.fps` as the ``fps``
+            metadata field at inference (paralleling
+            :attr:`DatasetMixtureConfig.emit_fps` at training time).
+            Defaults to ``True``. When ``False``, the policy prefix omits
+            the ``fps:`` segment — useful when resuming a checkpoint
+            trained without fps conditioning.
     """
 
     speed: int | None = None
@@ -83,6 +89,7 @@ class EnvMetadataConfig:
     mistake: bool | None = None
     robot_type: str | None = None
     control_mode: ControlMode | None = None
+    emit_fps: bool = True
 
     def __post_init__(self) -> None:
         # `isinstance(x, bool)` guards exclude Python bools — `bool` is a
diff --git a/src/opentau/envs/utils.py b/src/opentau/envs/utils.py
index 387c2187..2af6c749 100644
--- a/src/opentau/envs/utils.py
+++ b/src/opentau/envs/utils.py
@@ -172,13 +172,23 @@ def add_eval_metadata(observation: dict[str, Any], cfg: TrainPipelineConfig) ->
     broadcast as ``list[str]`` of length ``num_envs``. Fields set to
     ``None`` on the config are skipped, so the corresponding batch key
     stays absent and the policy's ``prepare_metadata`` pad default kicks
-    in. ``cfg.env`` is guaranteed non-``None`` here — ``eval_policy``
+    in.
+
+    ``fps`` is the env's stepping frequency (:attr:`EnvConfig.fps`,
+    e.g. ``20`` for LIBERO) and is broadcast as a ``torch.long`` tensor
+    with ``fps_is_pad=False`` whenever ``meta.emit_fps`` is ``True``
+    (the default). Set ``emit_fps=False`` on ``EnvMetadataConfig`` to
+    skip it — useful when resuming a checkpoint trained without fps
+    conditioning.
+
+    ``cfg.env`` is guaranteed non-``None`` here — ``eval_policy``
     dereferences ``cfg.env.type`` before the rollout loop starts.
 
     Args:
         observation: Observation dict produced by ``preprocess_observation``
             (must contain ``state`` to source the batch size and device).
-        cfg: Training/eval pipeline config; reads ``cfg.env.metadata``.
+        cfg: Training/eval pipeline config; reads ``cfg.env.metadata`` and
+            ``cfg.env.fps``.
 
     Returns:
         The observation dict, mutated in place.
@@ -203,6 +213,10 @@ def add_eval_metadata(observation: dict[str, Any], cfg: TrainPipelineConfig) ->
         if val is not None:
             observation[key] = [val] * batch_size
 
+    if meta.emit_fps:
+        observation["fps"] = torch.tensor([cfg.env.fps] * batch_size, dtype=torch.long, device=device)
+        observation["fps_is_pad"] = torch.zeros(batch_size, dtype=torch.bool, device=device)
+
     return observation
 
 
diff --git a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py
index b15eee19..d320bce6 100644
--- a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py
+++ b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py
@@ -688,12 +688,12 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
 
         Args:
             batch: Batch dict that may contain any of:
-                ``"speed"``, ``"quality"``, ``"mistake"`` (float tensors with
-                a corresponding ``_is_pad`` bool tensor — entries marked as
-                pad are dropped), and ``"robot_type"``, ``"control_mode"``
-                (lists of strings — empty string is the pad signal, no
-                separate ``_is_pad`` flag). Missing keys are treated as
-                fully padded.
+                ``"speed"``, ``"quality"``, ``"mistake"``, ``"fps"`` (numeric
+                tensors with a corresponding ``_is_pad`` bool tensor —
+                entries marked as pad are dropped), and ``"robot_type"``,
+                ``"control_mode"`` (lists of strings — empty string is the
+                pad signal, no separate ``_is_pad`` flag). Missing keys are
+                treated as fully padded.
 
         Returns:
             A tuple ``(metadata_tokens, metadata_masks)`` with shapes
@@ -711,6 +711,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             mistake_is_pad,
             robot_type,
             control_mode,
+            fps,
+            fps_is_pad,
         ) in zip(
             batch.get("speed", torch.zeros(batch_size, dtype=torch.float32)),
             batch.get("quality", torch.zeros(batch_size, dtype=torch.float32)),
@@ -720,6 +722,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             batch.get("mistake_is_pad", torch.ones(batch_size, dtype=torch.bool)),
             batch.get("robot_type", [""] * batch_size),
             batch.get("control_mode", [""] * batch_size),
+            batch.get("fps", torch.zeros(batch_size, dtype=torch.long)),
+            batch.get("fps_is_pad", torch.ones(batch_size, dtype=torch.bool)),
             strict=True,
         ):
             segments = []
@@ -735,6 +739,9 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             if robot_type:
                 segments.append(f"Robot: {robot_type}, ")
 
+            if not fps_is_pad:
+                segments.append(f"fps: {str(fps.item())}, ")
+
             if control_mode:
                 segments.append(f"Control: {control_mode}, ")
 
diff --git a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
index 58bbc2dd..6ca13d3c 100644
--- a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
+++ b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
@@ -1087,12 +1087,12 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
 
         Args:
             batch: Batch dict that may contain any of:
-                ``"speed"``, ``"quality"``, ``"mistake"`` (float tensors with
-                a corresponding ``_is_pad`` bool tensor — entries marked as
-                pad are dropped), and ``"robot_type"``, ``"control_mode"``
-                (lists of strings — empty string is the pad signal, no
-                separate ``_is_pad`` flag). Missing keys are treated as
-                fully padded.
+                ``"speed"``, ``"quality"``, ``"mistake"``, ``"fps"`` (numeric
+                tensors with a corresponding ``_is_pad`` bool tensor —
+                entries marked as pad are dropped), and ``"robot_type"``,
+                ``"control_mode"`` (lists of strings — empty string is the
+                pad signal, no separate ``_is_pad`` flag). Missing keys are
+                treated as fully padded.
 
         Returns:
             A tuple ``(metadata_tokens, metadata_masks)`` with shapes
@@ -1111,6 +1111,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             mistake_is_pad,
             robot_type,
             control_mode,
+            fps,
+            fps_is_pad,
         ) in zip(
             batch.get("speed", torch.zeros(batch_size, dtype=torch.float32)),
             batch.get("quality", torch.zeros(batch_size, dtype=torch.float32)),
@@ -1120,6 +1122,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             batch.get("mistake_is_pad", torch.ones(batch_size, dtype=torch.bool)),
             batch.get("robot_type", [""] * batch_size),
             batch.get("control_mode", [""] * batch_size),
+            batch.get("fps", torch.zeros(batch_size, dtype=torch.long)),
+            batch.get("fps_is_pad", torch.ones(batch_size, dtype=torch.bool)),
             strict=True,
         ):
             segments = []
@@ -1135,6 +1139,9 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             if robot_type:
                 segments.append(f"Robot: {robot_type}, ")
 
+            if not fps_is_pad:
+                segments.append(f"fps: {str(fps.item())}, ")
+
             if control_mode:
                 segments.append(f"Control: {control_mode}, ")
 
diff --git a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py
index 7164794b..686e4f38 100644
--- a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py
+++ b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py
@@ -665,17 +665,24 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
         """Tokenizes the metadata for training.
 
         Wraps each metadata string with an ``<eos>`` suffix, then tokenizes and
-        pads to ``metadata_max_length``.
+        pads to ``metadata_max_length``. Numeric fields (``speed`` / ``quality``
+        / ``mistake`` / ``fps``) are paired with ``_is_pad`` flags — pad-True
+        entries are omitted from the segment string. ``fps`` is the dataset's
+        native frame rate (``torch.long``); when absent or padded the
+        ``fps:`` segment is dropped (no header, no comma).
         """
 
         metadata = []
-        for speed, quality, mistake, speed_is_pad, quality_is_pad, mistake_is_pad in zip(
+        batch_size = batch["state"].shape[0]
+        for speed, quality, mistake, speed_is_pad, quality_is_pad, mistake_is_pad, fps, fps_is_pad in zip(
             batch["speed"],
             batch["quality"],
             batch["mistake"],
             batch["speed_is_pad"],
             batch["quality_is_pad"],
             batch["mistake_is_pad"],
+            batch.get("fps", torch.zeros(batch_size, dtype=torch.long)),
+            batch.get("fps_is_pad", torch.ones(batch_size, dtype=torch.bool)),
             strict=True,
         ):
             segments = []
@@ -688,6 +695,9 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
             if not mistake_is_pad:
                 segments.append(f"Mistake: {str(mistake.item())}, ")
 
+            if not fps_is_pad:
+                segments.append(f"fps: {str(fps.item())}, ")
+
             metadata.append(f"Metadata: {' '.join(segments)}")
 
         device = batch["state"].device
diff --git a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py
index 44fa9600..831585a6 100644
--- a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py
+++ b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py
@@ -1305,6 +1305,12 @@ def _hydrate_metadata_batch(self, batch: dict) -> None:
           specified explicitly.
         - ``robot_type`` / ``control_mode`` are string lists (empty string is
           the pad signal, no separate ``_is_pad`` flag) and default to ``[""]``.
+        - ``fps`` (the dataset's native frame rate, ``torch.long``) defaults
+          to zeros and ``fps_is_pad`` defaults to ``True`` when the keys
+          are absent. The dataloader emits ``fps`` unconditionally (no
+          dropout) when ``DatasetMixtureConfig.emit_fps=True``; inference
+          batches built by ``add_eval_metadata`` set it when
+          ``EnvMetadataConfig.emit_fps=True``.
 
         Per-sample on/off is controlled only by ``*_is_pad`` (or the empty
         string for ``robot_type`` / ``control_mode``) after hydration; varying
@@ -1328,6 +1334,10 @@ def _hydrate_metadata_batch(self, batch: dict) -> None:
             batch["robot_type"] = [""] * bsz
         if "control_mode" not in batch:
             batch["control_mode"] = [""] * bsz
+        if "fps" not in batch:
+            batch["fps"] = torch.zeros(bsz, dtype=torch.long, device=dev)
+        if "fps_is_pad" not in batch:
+            batch["fps_is_pad"] = torch.ones(bsz, dtype=torch.bool, device=dev)
 
     def _hydrate_optional_conditioning_batch(self, batch: dict) -> None:
         """Ensure response / metadata keys exist (training + inference parity).
@@ -1375,11 +1385,16 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
         """Tokenize episode metadata into PaliGemma token IDs.
 
         Builds strings ``Metadata: Speed: … Quality: … Mistake: … Robot: …
-        Control: …`` only from fields whose ``*_is_pad`` flag is ``False``
-        (``robot_type`` / ``control_mode`` are strings, omitted when empty).
-        For each sample, a padded or empty field is omitted entirely (not
-        concatenated to ``segments``). When every field is pad, the row is
-        ``""`` before tokenization.
+        fps: … Control: …`` only from fields whose ``*_is_pad`` flag is
+        ``False`` (``robot_type`` / ``control_mode`` are strings, omitted
+        when empty). For each sample, a padded or empty field is omitted
+        entirely (not concatenated to ``segments``). When every field is
+        pad, the row is ``""`` before tokenization.
+
+        ``fps`` is the dataset's native frame rate (``torch.long``); the
+        segment is omitted when ``fps_is_pad`` is True. Segment ordering is
+        ``Speed → Quality → Mistake → Robot → fps → Control`` to match the
+        other pi07 / pi07_paligemma ``prepare_metadata`` implementations.
 
         Always runs :meth:`_hydrate_metadata_batch` first so callers see the same
         outcomes whether they hit :meth:`sample_actions` (often no metadata
@@ -1390,14 +1405,15 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
         scalar tensors broadcast like ``subgoal_is_pad``.
 
         .. note::
-            ``speed_is_pad`` / ``quality_is_pad`` / ``mistake_is_pad`` each
-            default to ``torch.ones(B, dtype=bool)`` (treat-as-pad) when the
-            key is missing from ``batch``. This is a behavior change from
-            the previous treat-as-real (``torch.zeros``) default. Hand-built
-            inference batches that supply ``"speed"`` / ``"quality"`` /
-            ``"mistake"`` but omit the corresponding ``_is_pad`` flag will
-            now have those metadata fields silently dropped from the prefix
-            string — pass ``..._is_pad=torch.zeros(...)`` explicitly when the
+            ``speed_is_pad`` / ``quality_is_pad`` / ``mistake_is_pad`` /
+            ``fps_is_pad`` each default to ``torch.ones(B, dtype=bool)``
+            (treat-as-pad) when the key is missing from ``batch``. This is
+            a behavior change from the previous treat-as-real
+            (``torch.zeros``) default. Hand-built inference batches that
+            supply ``"speed"`` / ``"quality"`` / ``"mistake"`` / ``"fps"``
+            but omit the corresponding ``_is_pad`` flag will now have
+            those metadata fields silently dropped from the prefix string
+            — pass ``..._is_pad=torch.zeros(...)`` explicitly when the
             metadata is real.
 
         Args:
@@ -1423,6 +1439,18 @@ def _row_float(key: str) -> Tensor:
                 )
             return t
 
+        def _row_long(key: str) -> Tensor:
+            """``(B,)`` long-int row (``fps``), scalar-broadcastable."""
+            t = batch[key]
+            t = torch.as_tensor(t, dtype=torch.long, device=device).reshape(-1)
+            if t.numel() == 1:
+                t = t.expand(b)
+            elif t.shape[0] != b:
+                raise ValueError(
+                    f"{key} must have shape ({b},) or be scalar broadcastable; got {tuple(t.shape)}."
+                )
+            return t
+
         def _row_bool(key: str) -> Tensor:
             """``(B,)`` boolean row (``mistake`` or ``*_is_pad``), scalar-broadcastable."""
             t = batch[key]
@@ -1438,9 +1466,11 @@ def _row_bool(key: str) -> Tensor:
         speed_t = _row_float("speed")
         quality_t = _row_float("quality")
         mistake_t = _row_bool("mistake")
+        fps_t = _row_long("fps")
         pad_speed = _row_bool("speed_is_pad")
         pad_quality = _row_bool("quality_is_pad")
         pad_mistake = _row_bool("mistake_is_pad")
+        pad_fps = _row_bool("fps_is_pad")
         robot_types = batch["robot_type"]
         control_modes = batch["control_mode"]
 
@@ -1454,6 +1484,8 @@ def _row_bool(key: str) -> Tensor:
             mistake_is_pad,
             robot_type,
             control_mode,
+            fps,
+            fps_is_pad,
         ) in zip(
             speed_t,
             quality_t,
@@ -1463,6 +1495,8 @@ def _row_bool(key: str) -> Tensor:
             pad_mistake,
             robot_types,
             control_modes,
+            fps_t,
+            pad_fps,
             strict=True,
         ):
             segments: list[str] = []
@@ -1477,6 +1511,8 @@ def _row_bool(key: str) -> Tensor:
                 segments.append(f"Mistake: {str(mistake.item())}, ")
             if robot_type:
                 segments.append(f"Robot: {robot_type}, ")
+            if not fps_is_pad.item():
+                segments.append(f"fps: {str(fps.item())}, ")
             if control_mode:
                 segments.append(f"Control: {control_mode}, ")
             metadata_rows.append(f"Metadata: {' '.join(segments)}" if segments else "")
diff --git a/tests/configs/test_default.py b/tests/configs/test_default.py
index 02e20c54..ef544129 100644
--- a/tests/configs/test_default.py
+++ b/tests/configs/test_default.py
@@ -74,10 +74,43 @@ def test_invalid_action_freq_raises_error(invalid_freq):
     """
     Tests that a ValueError is raised if action_freq is zero or negative.
     """
-    with pytest.raises(ValueError, match=f"`action_freq` must be a positive number, got {invalid_freq}."):
+    with pytest.raises(
+        ValueError,
+        match=rf"`action_freq` must be a positive number or None, got {invalid_freq}\.",
+    ):
         DatasetMixtureConfig(action_freq=invalid_freq)
 
 
+def test_action_freq_defaults_to_none():
+    """`action_freq=None` is the new default — each dataset trains at its native fps."""
+    cfg = DatasetMixtureConfig()
+    assert cfg.action_freq is None
+
+
+def test_action_freq_none_is_valid():
+    """Explicit `None` is accepted and disables resampling."""
+    cfg = DatasetMixtureConfig(action_freq=None)
+    assert cfg.action_freq is None
+
+
+def test_action_freq_positive_still_valid():
+    """A positive float still pins every dataset to that common rate."""
+    cfg = DatasetMixtureConfig(action_freq=30.0)
+    assert cfg.action_freq == 30.0
+
+
+def test_emit_fps_defaults_to_true():
+    """The new `emit_fps` toggle defaults to True so the policy gets fps conditioning out of the box."""
+    cfg = DatasetMixtureConfig()
+    assert cfg.emit_fps is True
+
+
+def test_emit_fps_can_be_disabled():
+    """Setting `emit_fps=False` should be accepted (legacy-checkpoint resume escape hatch)."""
+    cfg = DatasetMixtureConfig(emit_fps=False)
+    assert cfg.emit_fps is False
+
+
 def test_invalid_image_resample_strategy_raises_error():
     """
     Tests that a ValueError is raised for an unsupported image_resample_strategy.
diff --git a/tests/datasets/test_delta_timestamps.py b/tests/datasets/test_delta_timestamps.py
index 142ec790..58af8874 100644
--- a/tests/datasets/test_delta_timestamps.py
+++ b/tests/datasets/test_delta_timestamps.py
@@ -318,6 +318,82 @@ def test_resolve_delta_timestamps_only_state_feature(train_pipeline_config, data
     assert dt_mean["state"] == [0.0]
 
 
+def test_resolve_delta_timestamps_native_fps_substitution(
+    train_pipeline_config, dataset_config, lerobot_dataset_metadata
+):
+    """`action_freq=None` swaps in the dataset's native fps.
+
+    With chunk-style action indices and a mismatched native fps, the
+    resulting per-action timestamps must land on native-fps frame
+    boundaries (``i / ds_meta.fps``) — so nearest-neighbor resampling is
+    a no-op and consecutive frames are returned unchanged.
+    """
+    # `action_delta_indices` is a property derived from `chunk_size`
+    # (see PI0Config.action_delta_indices = list(range(chunk_size))), so drive
+    # the test via `chunk_size` rather than monkey-patching the property.
+    train_pipeline_config.policy.chunk_size = 4
+    train_pipeline_config.dataset_mixture.action_freq = None
+
+    # Inject the action feature into the metadata so it gets picked up.
+    lerobot_dataset_metadata.features.update({"actions": {"dtype": "float32", "shape": [32]}})
+    # Pin native fps to something distinct from the legacy default (30) so the
+    # substitution is observable in the output.
+    lerobot_dataset_metadata.fps = 50
+
+    dt_mean, _, _, _ = resolve_delta_timestamps(
+        train_pipeline_config, dataset_config, lerobot_dataset_metadata
+    )
+
+    # Each delta-timestamp value equals `i / native_fps`.
+    expected = [i / 50 for i in range(4)]
+    assert np.allclose(dt_mean["actions"], expected)
+
+
+def test_resolve_delta_timestamps_native_fps_differs_per_dataset(
+    train_pipeline_config, dataset_config, lerobot_dataset_metadata
+):
+    """Two datasets in the same mixture with different native fps must
+    produce different timestamps when action_freq is None.
+
+    This is the core invariant of mixed-frequency training: a 30 Hz source
+    and a 50 Hz source see chunk timestamps scaled to their respective
+    rates, so nearest-neighbor sampling lands on consecutive native frames
+    in both cases.
+    """
+    train_pipeline_config.policy.chunk_size = 3
+    train_pipeline_config.dataset_mixture.action_freq = None
+    lerobot_dataset_metadata.features.update({"actions": {"dtype": "float32", "shape": [32]}})
+
+    lerobot_dataset_metadata.fps = 30
+    dt_a, _, _, _ = resolve_delta_timestamps(train_pipeline_config, dataset_config, lerobot_dataset_metadata)
+
+    lerobot_dataset_metadata.fps = 50
+    dt_b, _, _, _ = resolve_delta_timestamps(train_pipeline_config, dataset_config, lerobot_dataset_metadata)
+
+    assert np.allclose(dt_a["actions"], [0 / 30, 1 / 30, 2 / 30])
+    assert np.allclose(dt_b["actions"], [0 / 50, 1 / 50, 2 / 50])
+    # Sanity check: the two are genuinely different (not just numerically
+    # close by accident).
+    assert not np.allclose(dt_a["actions"], dt_b["actions"])
+
+
+def test_resolve_delta_timestamps_action_freq_overrides_native_fps(
+    train_pipeline_config, dataset_config, lerobot_dataset_metadata
+):
+    """When `action_freq` is set (non-None), the native fps is ignored."""
+    train_pipeline_config.policy.chunk_size = 3
+    train_pipeline_config.dataset_mixture.action_freq = 30.0
+    lerobot_dataset_metadata.features.update({"actions": {"dtype": "float32", "shape": [32]}})
+    lerobot_dataset_metadata.fps = 50  # different native fps — must be overridden
+
+    dt_mean, _, _, _ = resolve_delta_timestamps(
+        train_pipeline_config, dataset_config, lerobot_dataset_metadata
+    )
+
+    # Timestamps reflect the configured `action_freq=30`, not native fps=50.
+    assert np.allclose(dt_mean["actions"], [0 / 30, 1 / 30, 2 / 30])
+
+
 def test_resolve_delta_timestamps_other_features_ignored(
     train_pipeline_config, dataset_config, lerobot_dataset_metadata
 ):
diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py
index c199ccc0..54d67071 100644
--- a/tests/datasets/test_optional_keys.py
+++ b/tests/datasets/test_optional_keys.py
@@ -64,7 +64,9 @@ def __init__(
         response_drop_prob=0.0,
         metadata_drop_all_prob=0.0,
         metadata_drop_each_prob=0.0,
+        emit_fps=True,
         meta_info: dict | None = None,
+        meta_fps: int = 30,
     ):
         torch.utils.data.Dataset.__init__(self)
         self.resolution = resolution
@@ -79,10 +81,16 @@ def __init__(
         self.response_drop_prob = response_drop_prob
         self.metadata_drop_all_prob = metadata_drop_all_prob
         self.metadata_drop_each_prob = metadata_drop_each_prob
+        self.emit_fps = emit_fps
         self.enable_optional_key_dropout = True
         # Stub meta surface so _to_standard_data_format can read robot_type /
         # control_mode without instantiating a real LeRobotDatasetMetadata.
-        self.meta = SimpleNamespace(info=meta_info if meta_info is not None else {})
+        # `fps` mirrors `LeRobotDatasetMetadata.fps` so `_emit_optional_keys`
+        # can populate the new `fps` / `fps_is_pad` sample keys.
+        self.meta = SimpleNamespace(
+            info=meta_info if meta_info is not None else {},
+            fps=meta_fps,
+        )
 
     def _get_feature_mapping_key(self) -> str:
         return _TEST_MAPPING_KEY
@@ -682,3 +690,77 @@ def test_emitted_as_python_strings_for_default_collate(self):
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert type(out["robot_type"]) is str
         assert type(out["control_mode"]) is str
+
+
+# `fps` is emitted as an intrinsic dataset property — always present (non-pad)
+# when `emit_fps=True`, omitted entirely when `emit_fps=False`. Unlike speed /
+# quality / mistake / robot_type / control_mode it does NOT participate in any
+# of the `metadata_drop_*_prob` dropout rolls.
+
+
+class TestFpsEmission:
+    def test_fps_emitted_with_meta_fps_value(self):
+        ds = _DummyBaseDataset(num_cams=1, meta_fps=20)
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert out["fps"].dtype == torch.long
+        assert out["fps"].item() == 20
+        assert out["fps_is_pad"].item() is False
+
+    def test_fps_emitted_as_torch_long_dtype(self):
+        ds = _DummyBaseDataset(num_cams=1, meta_fps=50)
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert out["fps"].dtype == torch.long
+
+    def test_fps_omitted_when_emit_fps_false(self):
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=False, meta_fps=20)
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert "fps" not in out
+        assert "fps_is_pad" not in out
+
+    def test_fps_unaffected_by_metadata_drop_all_prob(self):
+        # fps is intrinsic to the dataset, not a noisy label — it must NOT
+        # participate in `metadata_drop_*_prob` rolls.
+        ds = _DummyBaseDataset(
+            num_cams=1,
+            meta_fps=15,
+            metadata_drop_all_prob=1.0,
+            metadata_drop_each_prob=1.0,
+        )
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        # Other metadata fields are dropped — but fps stays.
+        assert out["speed_is_pad"].item() is True
+        assert out["quality_is_pad"].item() is True
+        assert out["mistake_is_pad"].item() is True
+        assert out["robot_type"] == ""
+        assert out["control_mode"] == ""
+        assert out["fps"].item() == 15
+        assert out["fps_is_pad"].item() is False
+
+    def test_fps_unaffected_by_disabled_dropout(self):
+        # `enable_optional_key_dropout=False` (the val-subset path) must keep
+        # the same emit behavior — fps is gated by `emit_fps`, not by dropout.
+        ds = _DummyBaseDataset(
+            num_cams=1,
+            meta_fps=25,
+            metadata_drop_all_prob=1.0,
+            metadata_drop_each_prob=1.0,
+        )
+        ds.enable_optional_key_dropout = False
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert out["fps"].item() == 25
+        assert out["fps_is_pad"].item() is False
+
+    def test_default_collate_stacks_mixed_fps(self):
+        """Heterogeneous-fps batches (different datasets in a mixture) must
+        collate cleanly: the per-sample `fps` tensors become a (B,) long
+        tensor and `fps_is_pad` becomes a (B,) bool tensor."""
+        from torch.utils.data import default_collate
+
+        ds_a = _DummyBaseDataset(num_cams=1, meta_fps=30)
+        ds_b = _DummyBaseDataset(num_cams=1, meta_fps=50)
+        item_a = ds_a._to_standard_data_format(_full_raw_item(ds_a))
+        item_b = ds_b._to_standard_data_format(_full_raw_item(ds_b))
+        batch = default_collate([item_a, item_b])
+        assert batch["fps"].dtype == torch.long
+        assert batch["fps"].tolist() == [30, 50]
+        assert batch["fps_is_pad"].tolist() == [False, False]
diff --git a/tests/envs/test_configs.py b/tests/envs/test_configs.py
index 3479c4ae..f475abd9 100644
--- a/tests/envs/test_configs.py
+++ b/tests/envs/test_configs.py
@@ -188,6 +188,23 @@ def test_all_none_default_passes(self):
         assert cfg.mistake is None
         assert cfg.robot_type is None
         assert cfg.control_mode is None
+        # `emit_fps` is a boolean toggle (not a None-presence field) and
+        # defaults to True — the policy receives an fps segment at eval
+        # unless the user explicitly opts out.
+        assert cfg.emit_fps is True
+
+    def test_emit_fps_can_be_disabled(self):
+        cfg = EnvMetadataConfig(emit_fps=False)
+        assert cfg.emit_fps is False
+
+    def test_emit_fps_kept_orthogonal_to_other_fields(self):
+        """Setting ``emit_fps`` must not perturb the other fields' defaults."""
+        cfg = EnvMetadataConfig(emit_fps=False)
+        assert cfg.speed is None
+        assert cfg.quality is None
+        assert cfg.mistake is None
+        assert cfg.robot_type is None
+        assert cfg.control_mode is None
 
     @pytest.mark.parametrize("mode", ["joint", "ee"])
     @pytest.mark.parametrize("speed", [0, 10, 50, 90, 100])
diff --git a/tests/envs/test_utils.py b/tests/envs/test_utils.py
index fdbd6e9e..ebb865a8 100644
--- a/tests/envs/test_utils.py
+++ b/tests/envs/test_utils.py
@@ -208,13 +208,21 @@ def _make_obs(batch_size: int = 2, device: str = "cpu") -> dict:
     return {"state": torch.zeros(batch_size, 8, device=device, dtype=torch.float32)}
 
 
-def _make_cfg(**metadata_kwargs) -> SimpleNamespace:
-    """Build a duck-typed ``cfg`` exposing only ``cfg.env.metadata``.
+def _make_cfg(*, fps: int = 30, **metadata_kwargs) -> SimpleNamespace:
+    """Build a duck-typed ``cfg`` exposing ``cfg.env.metadata`` and
+    ``cfg.env.fps``.
 
     Avoids constructing a full ``TrainPipelineConfig`` (which pulls in
-    optimizer/dataset/policy validation) just to read five attributes.
+    optimizer/dataset/policy validation) just to read a handful of attributes.
+    ``fps`` defaults to 30 (matches ``EnvConfig.fps``) so the
+    ``emit_fps=True`` default path produces a deterministic value.
     """
-    return SimpleNamespace(env=SimpleNamespace(metadata=EnvMetadataConfig(**metadata_kwargs)))
+    return SimpleNamespace(
+        env=SimpleNamespace(
+            metadata=EnvMetadataConfig(**metadata_kwargs),
+            fps=fps,
+        )
+    )
 
 
 class TestAddEvalMetadata:
@@ -224,13 +232,31 @@ class TestAddEvalMetadata:
     silently changes the prefix the model sees at eval.
     """
 
-    def test_all_none_default_skips_every_key(self):
+    def test_all_none_default_skips_user_keys_but_still_emits_fps(self):
+        """``EnvMetadataConfig`` defaults: every user-controlled field is
+        ``None`` *but* ``emit_fps`` defaults to ``True``, so the helper still
+        injects ``fps`` / ``fps_is_pad`` from ``cfg.env.fps``.
+        """
         obs = _make_obs()
         original_keys = set(obs.keys())
         out = add_eval_metadata(obs, cfg=_make_cfg())
         assert out is obs  # mutated-in-place contract
+        new_keys = set(out.keys()) - original_keys
+        assert new_keys == {"fps", "fps_is_pad"}, (
+            f"unexpected injected keys: {new_keys}; only fps/fps_is_pad should appear "
+            f"under the all-None metadata default"
+        )
+
+    def test_emit_fps_false_skips_fps(self):
+        """``emit_fps=False`` on ``EnvMetadataConfig`` reverts the helper to the
+        legacy "skip all when nothing set" behaviour — useful when resuming a
+        checkpoint trained without fps conditioning.
+        """
+        obs = _make_obs()
+        original_keys = set(obs.keys())
+        out = add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False))
         assert set(out.keys()) == original_keys, (
-            "no metadata keys should be injected when every field is None"
+            "no keys should be injected when emit_fps=False and every metadata field is None"
         )
 
     @pytest.mark.parametrize("batch_size", [1, 4])
@@ -291,12 +317,22 @@ def test_string_fields_broadcast_as_list(self, key, value):
         assert f"{key}_is_pad" not in obs, "string fields use empty-string as pad signal, not a flag"
 
     def test_partial_fields_only_inject_specified_keys(self):
+        # `emit_fps=False` so this test isolates the user-set fields from the
+        # default fps injection (covered separately in test_emit_fps_*).
         obs = _make_obs()
-        add_eval_metadata(obs, cfg=_make_cfg(speed=30, robot_type="UR5"))
+        add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5"))
 
         assert "speed" in obs and "speed_is_pad" in obs
         assert "robot_type" in obs
-        for absent in ("quality", "quality_is_pad", "mistake", "mistake_is_pad", "control_mode"):
+        for absent in (
+            "quality",
+            "quality_is_pad",
+            "mistake",
+            "mistake_is_pad",
+            "control_mode",
+            "fps",
+            "fps_is_pad",
+        ):
             assert absent not in obs, f"unset field {absent} should not be injected"
 
     def test_device_propagation(self):
@@ -313,3 +349,42 @@ def test_device_propagation(self):
         assert obs["speed_is_pad"].device.type == "meta"
         assert obs["mistake"].device.type == "meta"
         assert obs["mistake_is_pad"].device.type == "meta"
+        # fps gets injected too under the default `emit_fps=True`; it must
+        # share the same device as state.
+        assert obs["fps"].device.type == "meta"
+        assert obs["fps_is_pad"].device.type == "meta"
+
+    @pytest.mark.parametrize("batch_size", [1, 4])
+    def test_fps_injects_long_tensor_from_env_fps(self, batch_size):
+        """``emit_fps=True`` (default) → broadcast ``cfg.env.fps`` as a
+        ``(B,)`` torch.long tensor with ``fps_is_pad=False`` rows."""
+        obs = _make_obs(batch_size=batch_size)
+        add_eval_metadata(obs, cfg=_make_cfg(fps=20))  # LiberoEnv-style fps
+
+        assert obs["fps"].dtype == torch.long
+        assert obs["fps"].shape == (batch_size,)
+        assert torch.equal(obs["fps"], torch.full((batch_size,), 20, dtype=torch.long))
+
+        assert obs["fps_is_pad"].dtype == torch.bool
+        assert obs["fps_is_pad"].shape == (batch_size,)
+        assert not obs["fps_is_pad"].any()
+
+    def test_fps_value_comes_from_env_not_metadata(self):
+        """The fps value source is ``cfg.env.fps`` (env stepping freq), NOT a
+        field on ``EnvMetadataConfig`` (which only carries the on/off toggle)."""
+        obs = _make_obs(batch_size=2)
+        add_eval_metadata(obs, cfg=_make_cfg(fps=50))
+
+        assert torch.equal(obs["fps"], torch.full((2,), 50, dtype=torch.long))
+
+    def test_fps_disabled_skips_only_fps_keys(self):
+        """``emit_fps=False`` skips the fps injection but leaves other set
+        fields alone — they go through the regular None-presence gating."""
+        obs = _make_obs(batch_size=2)
+        add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5"))
+
+        assert "fps" not in obs
+        assert "fps_is_pad" not in obs
+        # Existing user-set fields still flow.
+        assert "speed" in obs
+        assert obs["robot_type"] == ["UR5", "UR5"]
diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py
index 88e254d7..59fde78d 100644
--- a/tests/policies/test_pi07_cpu.py
+++ b/tests/policies/test_pi07_cpu.py
@@ -542,6 +542,148 @@ def test_low_level_missing_speed_pad_does_not_fabricate_value(self):
             assert "Mistake:" not in line
 
 
+class TestPrepareMetadataFps:
+    """Verify the ``fps`` segment construction in the pi07 planners.
+
+    ``fps`` is the dataset's native frame rate. The planner tokenizes it
+    with the lowercase ``"fps: N, "`` header, positioned between
+    ``Robot:`` and ``Control:`` for consistency with the
+    ``Speed → Quality → Mistake → Robot → fps → Control`` ordering pinned
+    in the segment-assembly loop of all four pi07 / pi07_paligemma
+    modelling files. The segment is omitted entirely (no header, no
+    comma) when ``fps_is_pad`` is ``True`` or when the ``fps`` key is
+    absent from the batch.
+    """
+
+    @staticmethod
+    def _planner_methods():
+        from opentau.policies.pi07.high_level_planner.modeling_pi07_high_level import (
+            PI07HighLevelPlannerPolicy,
+        )
+        from opentau.policies.pi07.low_level.modeling_pi07_low_level import (
+            PI07LowLevelPolicy,
+        )
+
+        return {
+            "low": PI07LowLevelPolicy.prepare_metadata,
+            "high": PI07HighLevelPlannerPolicy.prepare_metadata,
+        }
+
+    @pytest.mark.parametrize("planner", ["low", "high"])
+    def test_fps_present_emits_lowercase_segment(self, planner):
+        """``fps=30`` non-pad → ``"fps: 30, "`` appears in the prefix."""
+        method = self._planner_methods()[planner]
+        fake, captured = _make_fake_planner()
+
+        batch_size = 2
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([30, 50], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False, False]),
+        }
+
+        method(fake, batch)
+
+        assert len(captured) == batch_size
+        for line, fps in zip(captured, [30, 50], strict=True):
+            assert line.startswith("Metadata: ")
+            assert f"fps: {fps}, " in line
+            # Lowercase per spec — must NOT be "Fps:" or "FPS:".
+            assert "Fps:" not in line
+            assert "FPS:" not in line
+
+    @pytest.mark.parametrize("planner", ["low", "high"])
+    def test_fps_padded_omits_segment(self, planner):
+        """``fps_is_pad=True`` → no ``"fps:"`` substring, no stray comma."""
+        method = self._planner_methods()[planner]
+        fake, captured = _make_fake_planner()
+
+        batch_size = 2
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([30, 30], dtype=torch.long),
+            "fps_is_pad": torch.tensor([True, True]),
+            "robot_type": ["franka", "ur5"],
+        }
+
+        method(fake, batch)
+
+        for line in captured:
+            assert "fps:" not in line
+            # No stray double commas where the fps segment would have lived.
+            assert ", ," not in line
+            # Robot segment still present (unaffected by fps padding).
+            assert "Robot: " in line
+
+    @pytest.mark.parametrize("planner", ["low", "high"])
+    def test_fps_key_absent_omits_segment(self, planner):
+        """Missing ``fps`` key entirely → batch.get default-pad path drops it."""
+        method = self._planner_methods()[planner]
+        fake, captured = _make_fake_planner()
+
+        batch_size = 3
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "robot_type": ["franka", "franka", "franka"],
+        }
+
+        method(fake, batch)
+
+        for line in captured:
+            assert "fps:" not in line
+
+    @pytest.mark.parametrize("planner", ["low", "high"])
+    def test_fps_slots_between_robot_and_control(self, planner):
+        """The new fps segment sits between Robot: and Control: in the prefix string."""
+        method = self._planner_methods()[planner]
+        fake, captured = _make_fake_planner()
+
+        batch_size = 1
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([20], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False]),
+            "robot_type": ["franka"],
+            "control_mode": ["joint"],
+        }
+
+        method(fake, batch)
+
+        line = captured[0]
+        robot_idx = line.index("Robot: ")
+        fps_idx = line.index("fps: 20")
+        control_idx = line.index("Control: ")
+        assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}"
+
+    @pytest.mark.parametrize("planner", ["low", "high"])
+    def test_fps_with_partial_segments(self, planner):
+        """fps + a subset of other metadata still produces a well-formed prefix."""
+        method = self._planner_methods()[planner]
+        fake, captured = _make_fake_planner()
+
+        batch_size = 2
+        # Sample 0: fps + robot_type only. Sample 1: fps + control_mode only.
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([30, 50], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False, False]),
+            "robot_type": ["franka", ""],
+            "control_mode": ["", "joint"],
+        }
+
+        method(fake, batch)
+
+        assert captured[0].startswith("Metadata: ")
+        assert "Robot: franka, " in captured[0]
+        assert "fps: 30, " in captured[0]
+        assert "Control:" not in captured[0]
+
+        assert captured[1].startswith("Metadata: ")
+        assert "Robot:" not in captured[1]
+        assert "fps: 50, " in captured[1]
+        assert "Control: joint, " in captured[1]
+
+
 # `embed_prefix` conditional-block guards
 #
 # The low-level component skips entire prefix blocks when their availability
diff --git a/tests/policies/test_pi07_paligemma_high_level_planner.py b/tests/policies/test_pi07_paligemma_high_level_planner.py
index a707e653..3014278b 100644
--- a/tests/policies/test_pi07_paligemma_high_level_planner.py
+++ b/tests/policies/test_pi07_paligemma_high_level_planner.py
@@ -322,3 +322,103 @@ def capture_embed_prefix_infer(*args, **kwargs):
         # Output shapes.
         assert memory_tokens.shape == (1, MEMORY_MAX_LENGTH)
         assert response_tokens.shape == (1, RESPONSE_MAX_LENGTH)
+
+
+# CPU-only tests for the metadata-string assembly loop in
+# ``PI07HighLevelPlannerPolicy.prepare_metadata``. Pin the contract that ``fps``
+# is tokenized with the lowercase ``"fps: N, "`` header positioned between
+# ``Speed/Quality/Mistake`` (this planner has no ``Robot:``/``Control:``
+# segments) and omitted entirely when ``fps_is_pad`` is True.
+
+
+def _pg_hl_make_fake_planner(metadata_max_length: int = 4):
+    """Construct a minimal ``self`` stand-in for the high-level pi07_paligemma
+    planner's ``prepare_metadata`` method.
+
+    The method only touches ``self.language_tokenizer`` and
+    ``self.config.metadata_max_length`` (no ``_hydrate_metadata_batch``), so a
+    plain SimpleNamespace with a stub tokenizer is enough — no PaliGemma
+    backbone, no HuggingFace download.
+    """
+    import types
+
+    captured: list[str] = []
+
+    def stub_call(metadata, **kwargs):
+        captured.extend(metadata)
+        batch_size = len(metadata)
+        max_length = kwargs.get("max_length", 4)
+        return {
+            "input_ids": torch.zeros((batch_size, max_length), dtype=torch.long),
+            "attention_mask": torch.zeros((batch_size, max_length), dtype=torch.long),
+        }
+
+    tokenizer = types.SimpleNamespace()
+    tokenizer.__call__ = stub_call
+    fake = types.SimpleNamespace(
+        language_tokenizer=tokenizer,
+        config=types.SimpleNamespace(metadata_max_length=metadata_max_length),
+    )
+    return fake, captured
+
+
+def _pg_hl_base_batch(batch_size: int) -> dict:
+    """Build a batch with the mandatory speed/quality/mistake keys.
+
+    Unlike pi07's ``prepare_metadata``, this planner reads ``batch["speed"]``
+    etc. directly (no ``batch.get`` fallback), so every required numeric
+    + ``_is_pad`` key must be present.
+    """
+    return {
+        "state": torch.zeros(batch_size, 1),
+        "speed": torch.tensor([50.0] * batch_size, dtype=torch.float32),
+        "quality": torch.tensor([3.0] * batch_size, dtype=torch.float32),
+        "mistake": torch.tensor([False] * batch_size, dtype=torch.bool),
+        "speed_is_pad": torch.tensor([True] * batch_size),
+        "quality_is_pad": torch.tensor([True] * batch_size),
+        "mistake_is_pad": torch.tensor([True] * batch_size),
+    }
+
+
+class TestPaligemmaHighLevelFpsSegment:
+    def test_fps_present_emits_lowercase_segment(self):
+        method = PI07HighLevelPlannerPolicy.prepare_metadata
+        fake, captured = _pg_hl_make_fake_planner()
+
+        batch = _pg_hl_base_batch(batch_size=2)
+        batch["fps"] = torch.tensor([30, 50], dtype=torch.long)
+        batch["fps_is_pad"] = torch.tensor([False, False])
+
+        method(fake, batch)
+
+        assert len(captured) == 2
+        for line, fps in zip(captured, [30, 50], strict=True):
+            assert line.startswith("Metadata: ")
+            assert f"fps: {fps}, " in line
+            assert "Fps:" not in line and "FPS:" not in line
+
+    def test_fps_padded_omits_segment(self):
+        method = PI07HighLevelPlannerPolicy.prepare_metadata
+        fake, captured = _pg_hl_make_fake_planner()
+
+        batch = _pg_hl_base_batch(batch_size=1)
+        batch["fps"] = torch.tensor([30], dtype=torch.long)
+        batch["fps_is_pad"] = torch.tensor([True])
+
+        method(fake, batch)
+
+        assert "fps:" not in captured[0]
+        # No stray comma where the fps segment would have lived.
+        assert ", ," not in captured[0]
+
+    def test_fps_key_absent_omits_segment(self):
+        method = PI07HighLevelPlannerPolicy.prepare_metadata
+        fake, captured = _pg_hl_make_fake_planner()
+
+        # Note: no "fps" / "fps_is_pad" keys at all — the `batch.get` defaults
+        # should kick in and produce no segment.
+        batch = _pg_hl_base_batch(batch_size=3)
+        method(fake, batch)
+
+        for line in captured:
+            assert "fps:" not in line
diff --git a/tests/policies/test_pi07_paligemma_low_level.py b/tests/policies/test_pi07_paligemma_low_level.py
index e09dca73..366c3593 100644
--- a/tests/policies/test_pi07_paligemma_low_level.py
+++ b/tests/policies/test_pi07_paligemma_low_level.py
@@ -2253,3 +2253,130 @@ def fake_sample_actions(batch, noise=None, action_prefix=None, delay=None):
         a_next = PI07PaligemmaLowLevelPolicy.select_action(policy, batch)
         assert calls["n"] == 2
         assert a_next[0, 0].item() == 2000.0
+
+
+# CPU-only tests for the metadata-string assembly loop in
+# ``PI07PaligemmaLowLevelPolicy.prepare_metadata``. Pin the contract that
+# ``fps`` is tokenized with the lowercase ``"fps: N, "`` header positioned
+# between ``Robot:`` and ``Control:`` and omitted entirely when ``fps_is_pad``
+# is True (or when the keys are missing — ``_hydrate_metadata_batch`` defaults
+# them to all-padded).
+
+
+def _pg_ll_make_fake_policy(metadata_max_length: int = 4):
+    """Construct a stubbed PI07PaligemmaLowLevelPolicy for prepare_metadata tests.
+
+    ``prepare_metadata`` calls ``self._hydrate_metadata_batch(batch)`` first,
+    so a bare ``SimpleNamespace`` doesn't work — we use
+    ``object.__new__(PI07PaligemmaLowLevelPolicy)`` to get a real instance
+    with the class methods bound, then stub only the tokenizer + config to
+    skip the PaliGemma backbone download.
+    """
+    import types
+
+    captured: list[str] = []
+
+    def stub_call(metadata, **kwargs):
+        captured.extend(metadata)
+        batch_size = len(metadata)
+        max_length = kwargs.get("max_length", 4)
+        return {
+            "input_ids": torch.zeros((batch_size, max_length), dtype=torch.long),
+            "attention_mask": torch.zeros((batch_size, max_length), dtype=torch.long),
+        }
+
+    tokenizer = types.SimpleNamespace()
+    tokenizer.__call__ = stub_call
+
+    policy = object.__new__(PI07PaligemmaLowLevelPolicy)
+    policy.language_tokenizer = tokenizer
+    policy.config = types.SimpleNamespace(metadata_max_length=metadata_max_length)
+    return policy, captured
+
+
+class TestPaligemmaLowLevelFpsSegment:
+    def test_fps_present_emits_lowercase_segment(self):
+        policy, captured = _pg_ll_make_fake_policy()
+        batch_size = 2
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([30, 50], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False, False]),
+        }
+        # _hydrate_metadata_batch will fill in speed/quality/mistake/* defaults
+        # (all padded) and empty robot_type/control_mode lists.
+
+        PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
+
+        assert len(captured) == batch_size
+        for line, fps in zip(captured, [30, 50], strict=True):
+            assert line.startswith("Metadata: ")
+            assert f"fps: {fps}, " in line
+            assert "Fps:" not in line and "FPS:" not in line
+
+    def test_fps_padded_omits_segment(self):
+        policy, captured = _pg_ll_make_fake_policy()
+        batch_size = 1
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([30], dtype=torch.long),
+            "fps_is_pad": torch.tensor([True]),
+            "robot_type": ["franka"],
+        }
+
+        PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
+
+        assert "fps:" not in captured[0]
+        assert ", ," not in captured[0]
+        assert "Robot: franka, " in captured[0]
+
+    def test_fps_key_absent_omits_segment(self):
+        """``_hydrate_metadata_batch`` defaults ``fps_is_pad`` to True when
+        absent, so missing keys collapse to the same "no segment" behaviour
+        as explicit-pad."""
+        policy, captured = _pg_ll_make_fake_policy()
+        batch_size = 3
+        batch = {"state": torch.zeros(batch_size, 1)}
+
+        PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
+
+        for line in captured:
+            assert "fps:" not in line
+
+    def test_fps_slots_between_robot_and_control(self):
+        policy, captured = _pg_ll_make_fake_policy()
+        batch_size = 1
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([20], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False]),
+            "robot_type": ["franka"],
+            "control_mode": ["joint"],
+        }
+
+        PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
+
+        line = captured[0]
+        robot_idx = line.index("Robot: ")
+        fps_idx = line.index("fps: 20")
+        control_idx = line.index("Control: ")
+        assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}"
+
+    def test_fps_default_torch_long(self):
+        """fps is hydrated with ``torch.long`` defaults; assert ``_row_long``
+        preserves the dtype through the broadcast path."""
+        policy, captured = _pg_ll_make_fake_policy()
+        # Scalar-broadcast fps (one element, fan out to (B,)).
+        batch_size = 3
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([25], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False]),
+        }
+
+        PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
+
+        # All three samples get fps: 25 from the scalar-broadcast.
+        assert len(captured) == 3
+        for line in captured:
+            assert "fps: 25, " in line

From 7236b5aec1a01a45d3abde7316009476015ef488 Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 19:37:41 -0700
Subject: [PATCH 2/7] fix(datasets): emit effective fps, not native, when
 action_freq is set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_emit_optional_keys` previously emitted `meta.fps` regardless of the
mixture's `action_freq`. When the user pinned `action_freq=30.0` on a
mixture containing a 50 Hz dataset, `resolve_delta_timestamps` resampled
the chunk to 30 Hz but the prefix still said `fps: 50, ` — telling the
policy a different rate than the chunk actually carried.

Capture `_action_freq` on `BaseDataset` and emit
`action_freq if set else meta.fps` so the tokenized fps always matches
the chunk's effective rate.
---
 src/opentau/datasets/lerobot_dataset.py | 39 ++++++++++++++++---------
 tests/datasets/test_optional_keys.py    | 27 +++++++++++++++++
 2 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py
index 9b3303ae..8fc43544 100644
--- a/src/opentau/datasets/lerobot_dataset.py
+++ b/src/opentau/datasets/lerobot_dataset.py
@@ -698,10 +698,12 @@ def __init__(self, cfg: TrainPipelineConfig):
         self.response_drop_prob = dm.response_drop_prob if dm else 0.0
         self.metadata_drop_all_prob = dm.metadata_drop_all_prob if dm else 0.0
         self.metadata_drop_each_prob = dm.metadata_drop_each_prob if dm else 0.0
-        # Whether `_emit_optional_keys` injects the dataset's native `meta.fps`
-        # under the `fps` / `fps_is_pad` keys. Independent of the dropout rolls
-        # above — fps is intrinsic to the dataset, not a noisy label.
+        # `_emit_optional_keys` emits the *effective* per-sample fps when this
+        # is True: the mixture's `action_freq` if set (every dataset has been
+        # resampled to that rate via `resolve_delta_timestamps`), else the
+        # dataset's native `meta.fps`. Independent of the dropout rolls above.
         self.emit_fps = dm.emit_fps if dm else True
+        self._action_freq = dm.action_freq if dm else None
         # Whether the above drop rolls actually fire. `make_dataset` flips this
         # off on the validation subset (unless `val_enable_optional_key_dropout`
         # is set). Subgoal *frame* selection (end-of-segment vs. uniform window)
@@ -879,14 +881,16 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None:
         pad signal — a consumer seeing ``""`` can assume the field was
         unavailable or was masked this step.
 
-        ``fps`` (the dataset's native frame rate) is emitted as a
-        ``torch.long`` scalar alongside ``fps_is_pad`` when
-        ``self.emit_fps`` is True. Unlike the other metadata fields, ``fps``
-        does **not** participate in the dropout rolls — it's an intrinsic
-        property of the dataset, not a noisy label — so it's always
-        non-padded when emitted. Set ``emit_fps=False`` on the mixture to
-        omit the keys entirely (the policy's ``prepare_metadata`` falls
-        through to its default-pad path).
+        ``fps`` (the *effective* per-sample frame rate — ``action_freq`` if
+        set on the mixture, else the dataset's native ``meta.fps``) is
+        emitted as a ``torch.long`` scalar alongside ``fps_is_pad`` when
+        ``self.emit_fps`` is True. Unlike the other metadata fields,
+        ``fps`` does **not** participate in the dropout rolls — it's an
+        intrinsic property of the (possibly resampled) chunk, not a noisy
+        label — so it's always non-padded when emitted. Set
+        ``emit_fps=False`` on the mixture to omit the keys entirely (the
+        policy's ``prepare_metadata`` falls through to its default-pad
+        path).
 
         Dropout order:
             1. ``history_state_drop_prob``: zero ``state`` and historical camera
@@ -989,10 +993,17 @@ def _roll(prob: float) -> bool:
             drop_this = drop_meta_all or _roll(self.metadata_drop_each_prob)
             standard_item[key] = "" if drop_this else val
 
-        # (6) Native fps: intrinsic dataset property, not a noisy label. Emitted
-        # unconditionally (no dropout) when `emit_fps` is enabled on the mixture.
+        # (6) Effective fps: the rate the action chunk actually runs at after
+        # any mixture-level resampling. When `action_freq` is set, every
+        # dataset's chunks are nearest-neighbor resampled to that rate by
+        # `resolve_delta_timestamps`, so tokenizing the dataset's native
+        # `meta.fps` would mislead the policy (the chunk timing differs).
+        # When `action_freq is None` (no resampling), the chunk runs at the
+        # dataset's native rate. Always non-pad (no dropout) when emit_fps
+        # is enabled on the mixture.
         if self.emit_fps:
-            standard_item["fps"] = torch.tensor(int(self.meta.fps), dtype=torch.long)
+            effective_fps = int(self._action_freq) if self._action_freq is not None else int(self.meta.fps)
+            standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long)
             standard_item["fps_is_pad"] = torch.tensor(False)
 
     def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor:
diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py
index 54d67071..60ee2f58 100644
--- a/tests/datasets/test_optional_keys.py
+++ b/tests/datasets/test_optional_keys.py
@@ -65,6 +65,7 @@ def __init__(
         metadata_drop_all_prob=0.0,
         metadata_drop_each_prob=0.0,
         emit_fps=True,
+        action_freq: float | None = None,
         meta_info: dict | None = None,
         meta_fps: int = 30,
     ):
@@ -82,6 +83,11 @@ def __init__(
         self.metadata_drop_all_prob = metadata_drop_all_prob
         self.metadata_drop_each_prob = metadata_drop_each_prob
         self.emit_fps = emit_fps
+        # Mirrors `BaseDataset._action_freq` (mixture-level `action_freq`).
+        # When set, `_emit_optional_keys` emits this value as the effective
+        # fps instead of `meta.fps` — matching the rate that
+        # `resolve_delta_timestamps` resampled the chunk to.
+        self._action_freq = action_freq
         self.enable_optional_key_dropout = True
         # Stub meta surface so _to_standard_data_format can read robot_type /
         # control_mode without instantiating a real LeRobotDatasetMetadata.
@@ -764,3 +770,24 @@ def test_default_collate_stacks_mixed_fps(self):
         assert batch["fps"].dtype == torch.long
         assert batch["fps"].tolist() == [30, 50]
         assert batch["fps_is_pad"].tolist() == [False, False]
+
+    def test_fps_reports_action_freq_when_resampling(self):
+        """When the mixture sets `action_freq`, every chunk is nearest-neighbor
+        resampled to that rate by `resolve_delta_timestamps`. The emitted
+        `fps` must reflect the *effective* rate of the resampled chunk, not
+        the dataset's native rate — otherwise the prefix tells the model a
+        different timing than the chunk actually carries.
+        """
+        # Dataset is natively 50 Hz, but the mixture pins everything to 30 Hz.
+        ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=30.0)
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert out["fps"].item() == 30
+        assert out["fps_is_pad"].item() is False
+
+    def test_fps_reports_native_when_action_freq_none(self):
+        """When `action_freq is None` (no resampling), the chunk runs at the
+        dataset's native rate — that's what gets emitted."""
+        ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=None)
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert out["fps"].item() == 50
+        assert out["fps_is_pad"].item() is False

From fe04ffd17d8e0051af27ae8c74d685950339bb5f Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 19:41:59 -0700
Subject: [PATCH 3/7] style(pi07): capitalize fps header to FPS for sibling
 consistency

---
 .../modeling_pi07_high_level.py               |  2 +-
 .../pi07/low_level/modeling_pi07_low_level.py |  2 +-
 .../modeling_pi07_high_level.py               |  2 +-
 .../low_level/modeling_pi07_low_level.py      | 13 +++----
 tests/policies/test_pi07_cpu.py               | 35 ++++++++++---------
 .../test_pi07_paligemma_high_level_planner.py | 13 +++----
 .../policies/test_pi07_paligemma_low_level.py | 27 +++++++-------
 7 files changed, 49 insertions(+), 45 deletions(-)

diff --git a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py
index d320bce6..e24fbd74 100644
--- a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py
+++ b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py
@@ -740,7 +740,7 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
                 segments.append(f"Robot: {robot_type}, ")
 
             if not fps_is_pad:
-                segments.append(f"fps: {str(fps.item())}, ")
+                segments.append(f"FPS: {str(fps.item())}, ")
 
             if control_mode:
                 segments.append(f"Control: {control_mode}, ")
diff --git a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
index 6ca13d3c..f552a5ec 100644
--- a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
+++ b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
@@ -1140,7 +1140,7 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
                 segments.append(f"Robot: {robot_type}, ")
 
             if not fps_is_pad:
-                segments.append(f"fps: {str(fps.item())}, ")
+                segments.append(f"FPS: {str(fps.item())}, ")
 
             if control_mode:
                 segments.append(f"Control: {control_mode}, ")
diff --git a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py
index 686e4f38..3e600da1 100644
--- a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py
+++ b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py
@@ -696,7 +696,7 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
                 segments.append(f"Mistake: {str(mistake.item())}, ")
 
             if not fps_is_pad:
-                segments.append(f"fps: {str(fps.item())}, ")
+                segments.append(f"FPS: {str(fps.item())}, ")
 
             metadata.append(f"Metadata: {' '.join(segments)}")
 
diff --git a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py
index 831585a6..c5c308d6 100644
--- a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py
+++ b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py
@@ -1385,16 +1385,17 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]:
         """Tokenize episode metadata into PaliGemma token IDs.
 
         Builds strings ``Metadata: Speed: … Quality: … Mistake: … Robot: …
-        fps: … Control: …`` only from fields whose ``*_is_pad`` flag is
+        FPS: … Control: …`` only from fields whose ``*_is_pad`` flag is
         ``False`` (``robot_type`` / ``control_mode`` are strings, omitted
         when empty). For each sample, a padded or empty field is omitted
         entirely (not concatenated to ``segments``). When every field is
         pad, the row is ``""`` before tokenization.
 
-        ``fps`` is the dataset's native frame rate (``torch.long``); the
-        segment is omitted when ``fps_is_pad`` is True. Segment ordering is
-        ``Speed → Quality → Mistake → Robot → fps → Control`` to match the
-        other pi07 / pi07_paligemma ``prepare_metadata`` implementations.
+        ``fps`` is the effective per-sample frame rate (``torch.long``);
+        the segment is omitted when ``fps_is_pad`` is True. Segment
+        ordering is ``Speed → Quality → Mistake → Robot → FPS → Control``
+        to match the other pi07 / pi07_paligemma ``prepare_metadata``
+        implementations.
 
         Always runs :meth:`_hydrate_metadata_batch` first so callers see the same
         outcomes whether they hit :meth:`sample_actions` (often no metadata
@@ -1512,7 +1513,7 @@ def _row_bool(key: str) -> Tensor:
             if robot_type:
                 segments.append(f"Robot: {robot_type}, ")
             if not fps_is_pad.item():
-                segments.append(f"fps: {str(fps.item())}, ")
+                segments.append(f"FPS: {str(fps.item())}, ")
             if control_mode:
                 segments.append(f"Control: {control_mode}, ")
             metadata_rows.append(f"Metadata: {' '.join(segments)}" if segments else "")
diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py
index 59fde78d..c710f680 100644
--- a/tests/policies/test_pi07_cpu.py
+++ b/tests/policies/test_pi07_cpu.py
@@ -545,11 +545,12 @@ def test_low_level_missing_speed_pad_does_not_fabricate_value(self):
 class TestPrepareMetadataFps:
     """Verify the ``fps`` segment construction in the pi07 planners.
 
-    ``fps`` is the dataset's native frame rate. The planner tokenizes it
-    with the lowercase ``"fps: N, "`` header, positioned between
-    ``Robot:`` and ``Control:`` for consistency with the
-    ``Speed → Quality → Mistake → Robot → fps → Control`` ordering pinned
-    in the segment-assembly loop of all four pi07 / pi07_paligemma
+    ``fps`` is the effective per-sample frame rate. The planner tokenizes
+    it with the ``"FPS: N, "`` header (uppercase to match the sibling
+    ``Speed:``/``Quality:``/``Mistake:``/``Robot:``/``Control:`` labels),
+    positioned between ``Robot:`` and ``Control:`` for consistency with
+    the ``Speed → Quality → Mistake → Robot → FPS → Control`` ordering
+    pinned in the segment-assembly loop of all four pi07 / pi07_paligemma
     modelling files. The segment is omitted entirely (no header, no
     comma) when ``fps_is_pad`` is ``True`` or when the ``fps`` key is
     absent from the batch.
@@ -570,8 +571,8 @@ def _planner_methods():
         }
 
     @pytest.mark.parametrize("planner", ["low", "high"])
-    def test_fps_present_emits_lowercase_segment(self, planner):
-        """``fps=30`` non-pad → ``"fps: 30, "`` appears in the prefix."""
+    def test_fps_present_emits_uppercase_segment(self, planner):
+        """``fps=30`` non-pad → ``"FPS: 30, "`` appears in the prefix."""
         method = self._planner_methods()[planner]
         fake, captured = _make_fake_planner()
 
@@ -587,14 +588,14 @@ def test_fps_present_emits_lowercase_segment(self, planner):
         assert len(captured) == batch_size
         for line, fps in zip(captured, [30, 50], strict=True):
             assert line.startswith("Metadata: ")
-            assert f"fps: {fps}, " in line
-            # Lowercase per spec — must NOT be "Fps:" or "FPS:".
+            assert f"FPS: {fps}, " in line
+            # Uppercase per spec — must NOT be "fps:" or "Fps:".
+            assert "fps:" not in line
             assert "Fps:" not in line
-            assert "FPS:" not in line
 
     @pytest.mark.parametrize("planner", ["low", "high"])
     def test_fps_padded_omits_segment(self, planner):
-        """``fps_is_pad=True`` → no ``"fps:"`` substring, no stray comma."""
+        """``fps_is_pad=True`` → no ``"FPS:"`` substring, no stray comma."""
         method = self._planner_methods()[planner]
         fake, captured = _make_fake_planner()
 
@@ -609,7 +610,7 @@ def test_fps_padded_omits_segment(self, planner):
         method(fake, batch)
 
         for line in captured:
-            assert "fps:" not in line
+            assert "FPS:" not in line
             # No stray double commas where the fps segment would have lived.
             assert ", ," not in line
             # Robot segment still present (unaffected by fps padding).
@@ -630,7 +631,7 @@ def test_fps_key_absent_omits_segment(self, planner):
         method(fake, batch)
 
         for line in captured:
-            assert "fps:" not in line
+            assert "FPS:" not in line
 
     @pytest.mark.parametrize("planner", ["low", "high"])
     def test_fps_slots_between_robot_and_control(self, planner):
@@ -651,9 +652,9 @@ def test_fps_slots_between_robot_and_control(self, planner):
 
         line = captured[0]
         robot_idx = line.index("Robot: ")
-        fps_idx = line.index("fps: 20")
+        fps_idx = line.index("FPS: 20")
         control_idx = line.index("Control: ")
-        assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}"
+        assert robot_idx < fps_idx < control_idx, f"Expected Robot < FPS < Control ordering in {line!r}"
 
     @pytest.mark.parametrize("planner", ["low", "high"])
     def test_fps_with_partial_segments(self, planner):
@@ -675,12 +676,12 @@ def test_fps_with_partial_segments(self, planner):
 
         assert captured[0].startswith("Metadata: ")
         assert "Robot: franka, " in captured[0]
-        assert "fps: 30, " in captured[0]
+        assert "FPS: 30, " in captured[0]
         assert "Control:" not in captured[0]
 
         assert captured[1].startswith("Metadata: ")
         assert "Robot:" not in captured[1]
-        assert "fps: 50, " in captured[1]
+        assert "FPS: 50, " in captured[1]
         assert "Control: joint, " in captured[1]
 
 
diff --git a/tests/policies/test_pi07_paligemma_high_level_planner.py b/tests/policies/test_pi07_paligemma_high_level_planner.py
index 3014278b..bc6348a6 100644
--- a/tests/policies/test_pi07_paligemma_high_level_planner.py
+++ b/tests/policies/test_pi07_paligemma_high_level_planner.py
@@ -326,7 +326,8 @@ def capture_embed_prefix_infer(*args, **kwargs):
 
 # CPU-only tests for the metadata-string assembly loop in
 # ``PI07HighLevelPlannerPolicy.prepare_metadata``. Pin the contract that ``fps``
-# is tokenized with the lowercase ``"fps: N, "`` header positioned between
+# is tokenized with the ``"FPS: N, "`` header (uppercase to match the sibling
+# ``Speed:``/``Quality:``/``Mistake:`` labels) positioned after
 # ``Speed/Quality/Mistake`` (this planner has no ``Robot:``/``Control:``
 # segments) and omitted entirely when ``fps_is_pad`` is True.
 
@@ -381,7 +382,7 @@ def _pg_hl_base_batch(batch_size: int) -> dict:
 
 
 class TestPaligemmaHighLevelFpsSegment:
-    def test_fps_present_emits_lowercase_segment(self):
+    def test_fps_present_emits_uppercase_segment(self):
         method = PI07HighLevelPlannerPolicy.prepare_metadata
         fake, captured = _pg_hl_make_fake_planner()
 
@@ -394,8 +395,8 @@ def test_fps_present_emits_lowercase_segment(self):
         assert len(captured) == 2
         for line, fps in zip(captured, [30, 50], strict=True):
             assert line.startswith("Metadata: ")
-            assert f"fps: {fps}, " in line
-            assert "Fps:" not in line and "FPS:" not in line
+            assert f"FPS: {fps}, " in line
+            assert "fps:" not in line and "Fps:" not in line
 
     def test_fps_padded_omits_segment(self):
         method = PI07HighLevelPlannerPolicy.prepare_metadata
@@ -407,7 +408,7 @@ def test_fps_padded_omits_segment(self):
 
         method(fake, batch)
 
-        assert "fps:" not in captured[0]
+        assert "FPS:" not in captured[0]
         # No stray comma where the fps segment would have lived.
         assert ", ," not in captured[0]
 
@@ -421,4 +422,4 @@ def test_fps_key_absent_omits_segment(self):
         method(fake, batch)
 
         for line in captured:
-            assert "fps:" not in line
+            assert "FPS:" not in line
diff --git a/tests/policies/test_pi07_paligemma_low_level.py b/tests/policies/test_pi07_paligemma_low_level.py
index 366c3593..aeced8f8 100644
--- a/tests/policies/test_pi07_paligemma_low_level.py
+++ b/tests/policies/test_pi07_paligemma_low_level.py
@@ -2257,10 +2257,11 @@ def fake_sample_actions(batch, noise=None, action_prefix=None, delay=None):
 
 # CPU-only tests for the metadata-string assembly loop in
 # ``PI07PaligemmaLowLevelPolicy.prepare_metadata``. Pin the contract that
-# ``fps`` is tokenized with the lowercase ``"fps: N, "`` header positioned
-# between ``Robot:`` and ``Control:`` and omitted entirely when ``fps_is_pad``
-# is True (or when the keys are missing — ``_hydrate_metadata_batch`` defaults
-# them to all-padded).
+# ``fps`` is tokenized with the ``"FPS: N, "`` header (uppercase to match
+# the sibling ``Speed:``/``Quality:``/``Mistake:``/``Robot:``/``Control:``
+# labels) positioned between ``Robot:`` and ``Control:`` and omitted entirely
+# when ``fps_is_pad`` is True (or when the keys are missing —
+# ``_hydrate_metadata_batch`` defaults them to all-padded).
 
 
 def _pg_ll_make_fake_policy(metadata_max_length: int = 4):
@@ -2295,7 +2296,7 @@ def stub_call(metadata, **kwargs):
 
 
 class TestPaligemmaLowLevelFpsSegment:
-    def test_fps_present_emits_lowercase_segment(self):
+    def test_fps_present_emits_uppercase_segment(self):
         policy, captured = _pg_ll_make_fake_policy()
         batch_size = 2
         batch = {
@@ -2311,8 +2312,8 @@ def test_fps_present_emits_lowercase_segment(self):
         assert len(captured) == batch_size
         for line, fps in zip(captured, [30, 50], strict=True):
             assert line.startswith("Metadata: ")
-            assert f"fps: {fps}, " in line
-            assert "Fps:" not in line and "FPS:" not in line
+            assert f"FPS: {fps}, " in line
+            assert "fps:" not in line and "Fps:" not in line
 
     def test_fps_padded_omits_segment(self):
         policy, captured = _pg_ll_make_fake_policy()
@@ -2326,7 +2327,7 @@ def test_fps_padded_omits_segment(self):
 
         PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
 
-        assert "fps:" not in captured[0]
+        assert "FPS:" not in captured[0]
         assert ", ," not in captured[0]
         assert "Robot: franka, " in captured[0]
 
@@ -2341,7 +2342,7 @@ def test_fps_key_absent_omits_segment(self):
         PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
 
         for line in captured:
-            assert "fps:" not in line
+            assert "FPS:" not in line
 
     def test_fps_slots_between_robot_and_control(self):
         policy, captured = _pg_ll_make_fake_policy()
@@ -2358,9 +2359,9 @@ def test_fps_slots_between_robot_and_control(self):
 
         line = captured[0]
         robot_idx = line.index("Robot: ")
-        fps_idx = line.index("fps: 20")
+        fps_idx = line.index("FPS: 20")
         control_idx = line.index("Control: ")
-        assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}"
+        assert robot_idx < fps_idx < control_idx, f"Expected Robot < FPS < Control ordering in {line!r}"
 
     def test_fps_default_torch_long(self):
         """fps is hydrated with ``torch.long`` defaults; assert ``_row_long``
@@ -2376,7 +2377,7 @@ def test_fps_default_torch_long(self):
 
         PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
 
-        # All three samples get fps: 25 from the scalar-broadcast.
+        # All three samples get FPS: 25 from the scalar-broadcast.
         assert len(captured) == 3
         for line in captured:
-            assert "fps: 25, " in line
+            assert "FPS: 25, " in line

From 07ce696a3bb428eac4a216899303a9328f7dfb58 Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 19:54:26 -0700
Subject: [PATCH 4/7] docs(concepts): document fps in standard data format

---
 docs/source/concepts.rst | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst
index 200034f3..22413879 100644
--- a/docs/source/concepts.rst
+++ b/docs/source/concepts.rst
@@ -28,7 +28,7 @@ This class:
 *   Combines multiple ``LeRobotDataset`` and ``VQADataset`` instances.
 *   Different weights can be assigned to each dataset to control the sampling frequency; if weights are omitted (or set to ``null`` in JSON), weights default to dataset lengths.
 *   Aggregates statistics from all constituent datasets to ensure consistent normalization across the mixture.
-*   Resamples the action output frequency to match the action frequency specified in the configuration.
+*   Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). The per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) so the policy can condition on it.
 
 Metadata
 ^^^^^^^^
@@ -130,6 +130,21 @@ Like ``speed``, ``mistake``, and ``quality``, they participate in the
 ``metadata_drop_all_prob`` / ``metadata_drop_each_prob`` dropout rolls —
 see :ref:`Training-time dropout <standard-data-format-optional-keys-dropout>`.
 
+``fps`` is the **effective per-sample frame rate** of the (possibly
+resampled) action chunk: ``DatasetMixtureConfig.action_freq`` when set,
+otherwise the dataset's native ``meta.fps``. Heterogeneous-frequency
+mixtures (``action_freq=None``) need it so the policy can condition on
+each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different
+real-time horizons even when both are ``chunk_size`` frames long. Unlike
+the other metadata fields, ``fps`` does **not** participate in the
+dropout rolls — it's an intrinsic property of the chunk, not a noisy
+label, so it's always non-pad when emitted. Emission is gated by
+``DatasetMixtureConfig.emit_fps`` (default ``True``); set it to
+``False`` to omit both ``fps`` and ``fps_is_pad`` from the sample dict
+(useful when resuming a checkpoint that was trained without fps
+conditioning, so the policy's metadata prefix doesn't gain an unfamiliar
+``FPS:`` segment).
+
 .. code-block:: python
 
     {
@@ -182,6 +197,19 @@ see :ref:`Training-time dropout <standard-data-format-optional-keys-dropout>`.
         "quality": torch.LongTensor,   # Scalar in {1,2,3,4,5}; episode-level quality score.
         "quality_is_pad": torch.BoolTensor,
 
+        "fps": torch.LongTensor,       # Scalar; effective per-sample frame rate of the action chunk.
+                                       # When `DatasetMixtureConfig.action_freq` is set, every dataset is
+                                       # resampled to that rate (via `resolve_delta_timestamps`) and `fps`
+                                       # reports `action_freq`. When `action_freq is None` (the default),
+                                       # the chunk runs at the dataset's native `meta.fps`. Gated by
+                                       # `DatasetMixtureConfig.emit_fps` (default `True`); the key is
+                                       # omitted entirely when `emit_fps=False`. Does NOT participate in
+                                       # `metadata_drop_*_prob` — fps is an intrinsic property of the
+                                       # chunk, not a noisy label.
+        "fps_is_pad": torch.BoolTensor,    # Always False when emitted (no dropout). Defaults to True only
+                                           # if a hand-built inference batch supplies `fps` without the
+                                           # paired flag — `prepare_metadata` then drops the segment.
+
         "subgoal0": torch.Tensor,       # shape (3, H, W), values in [0,1]. A single future frame from
                                         # camera0 sampled either at end-of-segment (with probability
                                         # `subgoal_end_of_segment_prob`) or uniformly in [t, t+4 seconds].

From 090b43659e5ed3d13a17f28d31fe2ffdf085c4a6 Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 19:59:52 -0700
Subject: [PATCH 5/7] fix(datasets): emit_fps no longer crashes on VQA samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_emit_optional_keys` accessed `self.meta.fps` unconditionally when
`emit_fps=True`, raising AttributeError on VQA datasets — whose
`VQADatasetMetadata` (inheriting `pass` from `DatasetMetadata`) has no
`fps` property. Any heterogeneous VLA + VQA mixture trained under the
default config crashed at the first VQA sample fetch.

Surface `fps -> None` on the `DatasetMetadata` base (overridden to int
on `LeRobotDatasetMetadata`), then have `_emit_optional_keys` pad out
the row (`fps=0, fps_is_pad=True`) when meta.fps is None. The policy's
`prepare_metadata` already drops the `FPS:` segment for padded samples,
so heterogeneous batches stay schema-aligned and the prefix omits the
fps token for VQA rows.
---
 docs/source/concepts.rst                |  6 ++-
 src/opentau/datasets/lerobot_dataset.py | 50 +++++++++++++----
 tests/datasets/test_optional_keys.py    | 71 +++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 11 deletions(-)

diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst
index 22413879..b0d790f9 100644
--- a/docs/source/concepts.rst
+++ b/docs/source/concepts.rst
@@ -138,7 +138,11 @@ each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different
 real-time horizons even when both are ``chunk_size`` frames long. Unlike
 the other metadata fields, ``fps`` does **not** participate in the
 dropout rolls — it's an intrinsic property of the chunk, not a noisy
-label, so it's always non-pad when emitted. Emission is gated by
+label, so it's always non-pad when emitted from a dataset that has a
+real frame rate. Samples from VQA datasets (no temporal axis) emit
+``fps=0, fps_is_pad=True`` so heterogeneous VLA + VQA mixtures stay
+schema-aligned across the batch; the policy's ``prepare_metadata``
+then drops the ``FPS:`` segment for those rows. Emission is gated by
 ``DatasetMixtureConfig.emit_fps`` (default ``True``); set it to
 ``False`` to omit both ``fps`` and ``fps_is_pad`` from the sample dict
 (useful when resuming a checkpoint that was trained without fps
diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py
index 8fc43544..b756f821 100644
--- a/src/opentau/datasets/lerobot_dataset.py
+++ b/src/opentau/datasets/lerobot_dataset.py
@@ -301,9 +301,28 @@ def shapes(self) -> dict:
         """Shapes for the different features."""
         return {key: tuple(ft["shape"]) for key, ft in self.features.items()}
 
+    @property
+    def fps(self) -> int | None:
+        """Native frame rate of the dataset.
+
+        Returns ``None`` for datasets without a temporal axis (e.g. VQA
+        image-text datasets). Subclasses with real frame-rate metadata
+        (``LeRobotDatasetMetadata``) override this to return the int from
+        ``info["fps"]``. Downstream callers that emit fps as a sample key
+        treat ``None`` as the pad signal so heterogeneous mixtures
+        (VLA + VQA) stay batchable.
+        """
+        return None
+
 
 class VQADatasetMetadata(DatasetMetadata):
-    """Metadata class for vqa datasets (vision-language datasets)."""
+    """Metadata class for vqa datasets (vision-language datasets).
+
+    Inherits ``fps -> None`` from :class:`DatasetMetadata` since VQA
+    samples have no temporal axis. :meth:`BaseDataset._emit_optional_keys`
+    sees this and emits ``fps_is_pad=True`` for VQA samples in a mixture,
+    keeping every batch row schema-aligned with the LeRobot samples.
+    """
 
     pass
 
@@ -887,10 +906,13 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None:
         ``self.emit_fps`` is True. Unlike the other metadata fields,
         ``fps`` does **not** participate in the dropout rolls — it's an
         intrinsic property of the (possibly resampled) chunk, not a noisy
-        label — so it's always non-padded when emitted. Set
-        ``emit_fps=False`` on the mixture to omit the keys entirely (the
-        policy's ``prepare_metadata`` falls through to its default-pad
-        path).
+        label — so it's always non-padded for samples that have a real
+        frame rate. Samples from VQA datasets (where
+        :attr:`DatasetMetadata.fps` is ``None``) emit
+        ``fps=0, fps_is_pad=True`` so a heterogeneous VLA + VQA mixture
+        stays schema-aligned across the batch. Set ``emit_fps=False`` on
+        the mixture to omit the keys entirely (the policy's
+        ``prepare_metadata`` falls through to its default-pad path).
 
         Dropout order:
             1. ``history_state_drop_prob``: zero ``state`` and historical camera
@@ -999,12 +1021,20 @@ def _roll(prob: float) -> bool:
         # `resolve_delta_timestamps`, so tokenizing the dataset's native
         # `meta.fps` would mislead the policy (the chunk timing differs).
         # When `action_freq is None` (no resampling), the chunk runs at the
-        # dataset's native rate. Always non-pad (no dropout) when emit_fps
-        # is enabled on the mixture.
+        # dataset's native rate. Datasets without a temporal axis (VQA;
+        # `DatasetMetadata.fps` returns ``None``) emit pad so heterogeneous
+        # VLA + VQA mixtures stay schema-aligned across the batch. Always
+        # non-pad (no dropout) for real-rate samples when emit_fps is
+        # enabled on the mixture.
         if self.emit_fps:
-            effective_fps = int(self._action_freq) if self._action_freq is not None else int(self.meta.fps)
-            standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long)
-            standard_item["fps_is_pad"] = torch.tensor(False)
+            native_fps = self.meta.fps
+            if native_fps is None:
+                standard_item["fps"] = torch.tensor(0, dtype=torch.long)
+                standard_item["fps_is_pad"] = torch.tensor(True)
+            else:
+                effective_fps = int(self._action_freq) if self._action_freq is not None else int(native_fps)
+                standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long)
+                standard_item["fps_is_pad"] = torch.tensor(False)
 
     def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor:
         """Resize an image to target dimensions with padding.
diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py
index 60ee2f58..81747bfa 100644
--- a/tests/datasets/test_optional_keys.py
+++ b/tests/datasets/test_optional_keys.py
@@ -791,3 +791,74 @@ def test_fps_reports_native_when_action_freq_none(self):
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert out["fps"].item() == 50
         assert out["fps_is_pad"].item() is False
+
+
+# VQA datasets have no temporal axis — `DatasetMetadata.fps` returns ``None``
+# on the base class (and on ``VQADatasetMetadata``, which inherits via ``pass``).
+# `_emit_optional_keys` must catch that and emit ``fps=0, fps_is_pad=True``
+# instead of crashing on ``int(None)`` — otherwise any heterogeneous mixture
+# that includes a VQA dataset crashes at the first sample fetch under the
+# default ``emit_fps=True``.
+
+
+class TestFpsEmissionVQA:
+    def test_base_metadata_fps_returns_none(self):
+        """``DatasetMetadata.fps`` is ``None`` for non-LeRobot datasets."""
+        from opentau.datasets.lerobot_dataset import DatasetMetadata, VQADatasetMetadata
+
+        assert DatasetMetadata().fps is None
+        assert VQADatasetMetadata().fps is None
+
+    def test_vqa_metadata_emits_pad_row(self):
+        """A dataset whose ``meta.fps`` is ``None`` (i.e. a VQA dataset) must
+        still produce a complete sample dict so heterogeneous batches stay
+        schema-aligned. The fps row is padded out as
+        ``fps=0, fps_is_pad=True`` (mirroring the speed/quality/mistake pad
+        shape) — the policy's ``prepare_metadata`` then drops the ``FPS:``
+        segment for that sample.
+        """
+        from opentau.datasets.lerobot_dataset import VQADatasetMetadata
+
+        ds = _DummyBaseDataset(num_cams=1)
+        ds.meta = VQADatasetMetadata(info={"features": {}})
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert "fps" in out and "fps_is_pad" in out
+        assert out["fps"].dtype == torch.long
+        assert out["fps"].item() == 0
+        assert out["fps_is_pad"].item() is True
+
+    def test_vqa_pad_ignores_action_freq(self):
+        """Even when the mixture sets ``action_freq``, a VQA sample emits
+        pad — VQA has no actions to be at any rate, so reporting
+        ``action_freq`` would be misleading."""
+        from opentau.datasets.lerobot_dataset import VQADatasetMetadata
+
+        ds = _DummyBaseDataset(num_cams=1, action_freq=30.0)
+        ds.meta = VQADatasetMetadata(info={"features": {}})
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert out["fps"].item() == 0
+        assert out["fps_is_pad"].item() is True
+
+    def test_heterogeneous_vla_vqa_collate(self):
+        """The pad-row trick keeps batches schema-aligned across a
+        VLA + VQA mixture: default_collate stacks fps into a ``(B,)`` long
+        tensor and fps_is_pad into a ``(B,)`` bool tensor with the
+        per-sample on/off correctly set.
+        """
+        from torch.utils.data import default_collate
+
+        from opentau.datasets.lerobot_dataset import VQADatasetMetadata
+
+        # LeRobot-shaped sample.
+        ds_vla = _DummyBaseDataset(num_cams=1, meta_fps=30)
+        item_vla = ds_vla._to_standard_data_format(_full_raw_item(ds_vla))
+
+        # VQA-shaped sample (same dummy class, real VQA metadata).
+        ds_vqa = _DummyBaseDataset(num_cams=1)
+        ds_vqa.meta = VQADatasetMetadata(info={"features": {}})
+        item_vqa = ds_vqa._to_standard_data_format(_full_raw_item(ds_vqa))
+
+        batch = default_collate([item_vla, item_vqa])
+        assert batch["fps"].dtype == torch.long
+        assert batch["fps"].tolist() == [30, 0]
+        assert batch["fps_is_pad"].tolist() == [False, True]

From 1467b57435f55f03b70ff98573e1c1872eda43a6 Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 20:18:01 -0700
Subject: [PATCH 6/7] test+docs(fps): pin mixed-pad batch + note
 metadata_drop_all semantic shift
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add `test_fps_mixed_pad_across_batch` in all three fps test classes —
  closes the per-sample fps_is_pad coverage gap, now reached in
  production by heterogeneous LeRobot + VQA mixtures where the VQA pad
  row (fps=0, fps_is_pad=True) sits next to a LeRobot row in the same
  batch.
- concepts.rst: document the metadata_drop_all_prob=1.0 semantic shift
  under emit_fps=True (no longer "no metadata at all" — it's "fps-only
  metadata") and note metadata_max_length=52 headroom.
---
 docs/source/concepts.rst                      | 21 +++++++++++++-
 tests/policies/test_pi07_cpu.py               | 29 +++++++++++++++++++
 .../test_pi07_paligemma_high_level_planner.py | 18 ++++++++++++
 .../policies/test_pi07_paligemma_low_level.py | 18 ++++++++++++
 4 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst
index b0d790f9..f67f4b97 100644
--- a/docs/source/concepts.rst
+++ b/docs/source/concepts.rst
@@ -132,7 +132,14 @@ see :ref:`Training-time dropout <standard-data-format-optional-keys-dropout>`.
 
 ``fps`` is the **effective per-sample frame rate** of the (possibly
 resampled) action chunk: ``DatasetMixtureConfig.action_freq`` when set,
-otherwise the dataset's native ``meta.fps``. Heterogeneous-frequency
+otherwise the dataset's native ``meta.fps``. The tokenized ``FPS: N, ``
+segment adds ~3-4 BPE tokens to the metadata prefix, which still fits
+comfortably inside the default ``metadata_max_length=52`` — but long
+``robot_type`` strings combined with a fully-populated metadata batch
+leave less headroom than before, and the underlying tokenizer call uses
+``truncation=True`` silently. Bump ``metadata_max_length`` (a field on
+each pi07 / pi07_paligemma config) if you start seeing the trailing
+``Control:`` segment get clipped. Heterogeneous-frequency
 mixtures (``action_freq=None``) need it so the policy can condition on
 each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different
 real-time horizons even when both are ``chunk_size`` frames long. Unlike
@@ -289,6 +296,18 @@ for reproducibility.
      - Per-field independent mask roll for each of ``speed``,
        ``mistake``, ``quality``, ``robot_type``, ``control_mode``.
        Only rolled when the shared drop did not fire.
+
+.. note::
+
+   ``fps`` is **not** in either drop pool — when ``emit_fps=True`` it
+   stays non-pad for every LeRobot sample regardless of the rolls. This
+   shifts the semantics of ``metadata_drop_all_prob=1.0``: pre-PR it
+   meant "no metadata segment at all", post-PR it means "fps-only
+   metadata segment" (the policy's ``has_metadata`` branch sees a
+   non-empty metadata mask and keeps the ``Metadata: FPS: N, `` block
+   in the prefix). If you're running a no-metadata ablation, also set
+   ``emit_fps=False`` on the mixture (and ``EnvMetadataConfig.emit_fps=False``
+   at eval) to recover the pre-PR behaviour.
    * - ``val_enable_optional_key_dropout``
      - ``False``
      - Whether the five drop rolls above also fire on the **validation**
diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py
index c710f680..6b0d2a7e 100644
--- a/tests/policies/test_pi07_cpu.py
+++ b/tests/policies/test_pi07_cpu.py
@@ -684,6 +684,35 @@ def test_fps_with_partial_segments(self, planner):
         assert "FPS: 50, " in captured[1]
         assert "Control: joint, " in captured[1]
 
+    @pytest.mark.parametrize("planner", ["low", "high"])
+    def test_fps_mixed_pad_across_batch(self, planner):
+        """Per-sample ``fps_is_pad`` must be honored row-by-row — sample 0 with
+        ``fps_is_pad=False`` keeps its FPS segment while sample 1 with
+        ``fps_is_pad=True`` drops it. This is the production path now
+        triggered by heterogeneous LeRobot + VQA mixtures, where the VQA
+        pad row (``fps=0, fps_is_pad=True``) sits next to a LeRobot row
+        (``fps=30, fps_is_pad=False``) in the same batch.
+        """
+        method = self._planner_methods()[planner]
+        fake, captured = _make_fake_planner()
+
+        batch_size = 2
+        batch = {
+            "state": torch.zeros(batch_size, 1),
+            "fps": torch.tensor([30, 0], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False, True]),
+            "robot_type": ["franka", "franka"],
+        }
+
+        method(fake, batch)
+
+        # Sample 0 (LeRobot): non-pad → FPS segment present.
+        assert "FPS: 30, " in captured[0]
+        assert "Robot: franka, " in captured[0]
+        # Sample 1 (VQA): pad → FPS segment dropped, no stray "FPS: 0, ".
+        assert "FPS:" not in captured[1]
+        assert "Robot: franka, " in captured[1]
+
 
 # `embed_prefix` conditional-block guards
 #
diff --git a/tests/policies/test_pi07_paligemma_high_level_planner.py b/tests/policies/test_pi07_paligemma_high_level_planner.py
index bc6348a6..376ee61b 100644
--- a/tests/policies/test_pi07_paligemma_high_level_planner.py
+++ b/tests/policies/test_pi07_paligemma_high_level_planner.py
@@ -423,3 +423,21 @@ def test_fps_key_absent_omits_segment(self):
 
         for line in captured:
             assert "FPS:" not in line
+
+    def test_fps_mixed_pad_across_batch(self):
+        """Per-sample ``fps_is_pad`` must be honored row-by-row — production
+        path for heterogeneous LeRobot + VQA mixtures, where the VQA pad
+        row (``fps=0, fps_is_pad=True``) sits next to a LeRobot row
+        (``fps=30, fps_is_pad=False``) in the same batch.
+        """
+        method = PI07HighLevelPlannerPolicy.prepare_metadata
+        fake, captured = _pg_hl_make_fake_planner()
+
+        batch = _pg_hl_base_batch(batch_size=2)
+        batch["fps"] = torch.tensor([30, 0], dtype=torch.long)
+        batch["fps_is_pad"] = torch.tensor([False, True])
+
+        method(fake, batch)
+
+        assert "FPS: 30, " in captured[0]
+        assert "FPS:" not in captured[1]
diff --git a/tests/policies/test_pi07_paligemma_low_level.py b/tests/policies/test_pi07_paligemma_low_level.py
index aeced8f8..35254878 100644
--- a/tests/policies/test_pi07_paligemma_low_level.py
+++ b/tests/policies/test_pi07_paligemma_low_level.py
@@ -2381,3 +2381,21 @@ def test_fps_default_torch_long(self):
         assert len(captured) == 3
         for line in captured:
             assert "FPS: 25, " in line
+
+    def test_fps_mixed_pad_across_batch(self):
+        """Per-sample ``fps_is_pad`` must be honored row-by-row — production
+        path for heterogeneous LeRobot + VQA mixtures, where the VQA pad
+        row (``fps=0, fps_is_pad=True``) sits next to a LeRobot row
+        (``fps=30, fps_is_pad=False``) in the same batch.
+        """
+        policy, captured = _pg_ll_make_fake_policy()
+        batch = {
+            "state": torch.zeros(2, 1),
+            "fps": torch.tensor([30, 0], dtype=torch.long),
+            "fps_is_pad": torch.tensor([False, True]),
+        }
+
+        PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch)
+
+        assert "FPS: 30, " in captured[0]
+        assert "FPS:" not in captured[1]

From 60bc00c810b1eb84982ff85a49742ba22d0d70a9 Mon Sep 17 00:00:00 2001
From: Shuheng Liu <wish1104@icloud.com>
Date: Fri, 22 May 2026 20:35:29 -0700
Subject: [PATCH 7/7] feat!(datasets): flip emit_fps default to False (opt-in)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`DatasetMixtureConfig.emit_fps` and `EnvMetadataConfig.emit_fps` now
default to `False` so pre-PR checkpoints resume cleanly — the policy's
metadata prefix doesn't gain an unfamiliar `FPS:` segment until the
user explicitly opts in. `action_freq=None` (mixed-frequency training)
stays the default; opt-in to `emit_fps=True` to surface per-sample
fps conditioning to the policy.

BaseDataset's dm-None fallback matched to False for the "no mixture
config" path (VQA-only / unit tests). Docs + tests updated to reflect
the new default — existing fps tests now set `emit_fps=True`
explicitly when they exercise the emission path.
---
 docs/source/concepts.rst                | 41 +++++++++----------
 src/opentau/configs/default.py          | 40 ++++++++++++-------
 src/opentau/datasets/lerobot_dataset.py |  5 ++-
 src/opentau/envs/configs.py             |  9 +++--
 tests/configs/test_default.py           | 18 +++++----
 tests/datasets/test_optional_keys.py    | 43 ++++++++++++--------
 tests/envs/test_configs.py              | 15 +++----
 tests/envs/test_utils.py                | 52 ++++++++++++-------------
 8 files changed, 128 insertions(+), 95 deletions(-)

diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst
index f67f4b97..41d032d7 100644
--- a/docs/source/concepts.rst
+++ b/docs/source/concepts.rst
@@ -28,7 +28,7 @@ This class:
 *   Combines multiple ``LeRobotDataset`` and ``VQADataset`` instances.
 *   Different weights can be assigned to each dataset to control the sampling frequency; if weights are omitted (or set to ``null`` in JSON), weights default to dataset lengths.
 *   Aggregates statistics from all constituent datasets to ensure consistent normalization across the mixture.
-*   Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). The per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) so the policy can condition on it.
+*   Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). When running mixed-frequency training, opt in to ``DatasetMixtureConfig.emit_fps=True`` so the per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) and the policy can condition on it.
 
 Metadata
 ^^^^^^^^
@@ -150,11 +150,13 @@ real frame rate. Samples from VQA datasets (no temporal axis) emit
 ``fps=0, fps_is_pad=True`` so heterogeneous VLA + VQA mixtures stay
 schema-aligned across the batch; the policy's ``prepare_metadata``
 then drops the ``FPS:`` segment for those rows. Emission is gated by
-``DatasetMixtureConfig.emit_fps`` (default ``True``); set it to
-``False`` to omit both ``fps`` and ``fps_is_pad`` from the sample dict
-(useful when resuming a checkpoint that was trained without fps
-conditioning, so the policy's metadata prefix doesn't gain an unfamiliar
-``FPS:`` segment).
+``DatasetMixtureConfig.emit_fps`` (default ``False`` — pre-PR
+checkpoints resume cleanly because the policy's metadata prefix doesn't
+gain an unfamiliar ``FPS:`` segment). Flip to ``True`` for new training
+runs that want per-sample fps conditioning, especially heterogeneous
+mixtures where ``action_freq=None`` lets each dataset run at its native
+rate. At inference, ``EnvMetadataConfig.emit_fps`` (same default
+``False``) gates the eval-side broadcast of ``cfg.env.fps``.
 
 .. code-block:: python
 
@@ -213,13 +215,14 @@ conditioning, so the policy's metadata prefix doesn't gain an unfamiliar
                                        # resampled to that rate (via `resolve_delta_timestamps`) and `fps`
                                        # reports `action_freq`. When `action_freq is None` (the default),
                                        # the chunk runs at the dataset's native `meta.fps`. Gated by
-                                       # `DatasetMixtureConfig.emit_fps` (default `True`); the key is
-                                       # omitted entirely when `emit_fps=False`. Does NOT participate in
-                                       # `metadata_drop_*_prob` — fps is an intrinsic property of the
-                                       # chunk, not a noisy label.
-        "fps_is_pad": torch.BoolTensor,    # Always False when emitted (no dropout). Defaults to True only
-                                           # if a hand-built inference batch supplies `fps` without the
-                                           # paired flag — `prepare_metadata` then drops the segment.
+                                       # `DatasetMixtureConfig.emit_fps` (default `False` — opt-in); the
+                                       # key is omitted entirely when `emit_fps=False`. Does NOT
+                                       # participate in `metadata_drop_*_prob` — fps is an intrinsic
+                                       # property of the chunk, not a noisy label.
+        "fps_is_pad": torch.BoolTensor,    # Always False when emitted from a real-rate sample. VQA samples
+                                           # (no temporal axis) emit `fps_is_pad=True` so heterogeneous
+                                           # VLA + VQA batches stay schema-aligned; `prepare_metadata`
+                                           # then drops the `FPS:` segment for those rows.
 
         "subgoal0": torch.Tensor,       # shape (3, H, W), values in [0,1]. A single future frame from
                                         # camera0 sampled either at end-of-segment (with probability
@@ -301,13 +304,11 @@ for reproducibility.
 
    ``fps`` is **not** in either drop pool — when ``emit_fps=True`` it
    stays non-pad for every LeRobot sample regardless of the rolls. This
-   shifts the semantics of ``metadata_drop_all_prob=1.0``: pre-PR it
-   meant "no metadata segment at all", post-PR it means "fps-only
-   metadata segment" (the policy's ``has_metadata`` branch sees a
-   non-empty metadata mask and keeps the ``Metadata: FPS: N, `` block
-   in the prefix). If you're running a no-metadata ablation, also set
-   ``emit_fps=False`` on the mixture (and ``EnvMetadataConfig.emit_fps=False``
-   at eval) to recover the pre-PR behaviour.
+   means under ``emit_fps=True``, ``metadata_drop_all_prob=1.0`` produces
+   a "fps-only metadata segment" rather than "no metadata segment at
+   all" (the policy's ``has_metadata`` branch sees a non-empty metadata
+   mask and keeps the ``Metadata: FPS: N, `` block in the prefix). For a
+   true no-metadata ablation, keep the default ``emit_fps=False``.
    * - ``val_enable_optional_key_dropout``
      - ``False``
      - Whether the five drop rolls above also fire on the **validation**
diff --git a/src/opentau/configs/default.py b/src/opentau/configs/default.py
index 19b723ae..de2e7a87 100644
--- a/src/opentau/configs/default.py
+++ b/src/opentau/configs/default.py
@@ -234,15 +234,23 @@ class DatasetMixtureConfig:
             must have a non-empty ``control_mode`` after the optional
             ``DatasetConfig.control_mode`` override has been applied. Defaults
             to ``False`` (empty / missing values are allowed).
-        emit_fps: Whether ``__getitem__`` returns the dataset's native
-            ``meta.fps`` as the ``fps`` metadata key (``torch.long`` scalar,
-            paired with ``fps_is_pad=False``). Default ``True``. When
-            ``False``, neither ``fps`` nor ``fps_is_pad`` appears in the
-            sample dict and the pi07 / pi07_paligemma policy prefix omits
-            the ``fps:`` segment. Unlike the other metadata fields, ``fps``
-            is **not** rolled by ``metadata_drop_*_prob`` — it's an intrinsic
-            property of the dataset, not a noisy label, so it is always
-            present when ``emit_fps=True``.
+        emit_fps: Whether ``__getitem__`` returns the *effective*
+            per-sample frame rate (``action_freq`` if set, else the
+            dataset's native ``meta.fps``) as the ``fps`` metadata key
+            (``torch.long`` scalar, paired with ``fps_is_pad=False``).
+            Default ``False`` — fps conditioning is an opt-in feature so
+            pre-PR checkpoints resume without the policy's metadata
+            prefix gaining an unfamiliar ``FPS:`` segment. Flip to
+            ``True`` for new training runs that want per-sample
+            frame-rate conditioning (especially heterogeneous-frequency
+            mixtures where ``action_freq=None`` lets each dataset run at
+            its native rate). Unlike the other metadata fields, ``fps``
+            is **not** rolled by ``metadata_drop_*_prob`` — it's an
+            intrinsic property of the chunk, not a noisy label, so it
+            is always present (non-pad) for LeRobot samples when
+            ``emit_fps=True``. VQA samples (no temporal axis) emit
+            ``fps=0, fps_is_pad=True`` regardless so heterogeneous
+            VLA + VQA batches stay schema-aligned.
         tolerance_s: Mixture-wide default tolerance (in seconds) for the
             load-time ``check_timestamps_sync`` call inside
             ``LeRobotDataset.__init__``. Each dataset's frame-to-frame
@@ -314,11 +322,15 @@ class DatasetMixtureConfig:
     require_non_empty_robot_type: bool = False
     require_non_empty_control_mode: bool = False
 
-    # Whether `__getitem__` emits the dataset's native fps as the `fps`
-    # metadata key. Independent of `metadata_drop_*_prob` — fps is intrinsic
-    # to the dataset, not a noisy label, so it is always present (never
-    # padded) when this is True.
-    emit_fps: bool = True
+    # Whether `__getitem__` emits the effective per-sample fps as the `fps`
+    # metadata key. Default `False` so pre-PR checkpoints resume cleanly
+    # (no new `FPS:` segment in the policy's metadata prefix). Flip to
+    # `True` for new training runs that want per-sample fps conditioning;
+    # especially relevant for heterogeneous-frequency mixtures
+    # (`action_freq=None`). Independent of `metadata_drop_*_prob` — fps
+    # is intrinsic to the chunk, not a noisy label, so it is always
+    # present (never padded) for LeRobot samples when this is True.
+    emit_fps: bool = False
 
     # Mixture-wide defaults for the load-time timestamp-sync check. Each
     # dataset can override these via `DatasetConfig.{tolerance_s,
diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py
index b756f821..b3ee9a32 100644
--- a/src/opentau/datasets/lerobot_dataset.py
+++ b/src/opentau/datasets/lerobot_dataset.py
@@ -721,7 +721,10 @@ def __init__(self, cfg: TrainPipelineConfig):
         # is True: the mixture's `action_freq` if set (every dataset has been
         # resampled to that rate via `resolve_delta_timestamps`), else the
         # dataset's native `meta.fps`. Independent of the dropout rolls above.
-        self.emit_fps = dm.emit_fps if dm else True
+        # Default-False fallback matches `DatasetMixtureConfig.emit_fps` so
+        # the "no mixture config" path (VQA-only / unit tests) behaves the
+        # same as the explicit-mixture default.
+        self.emit_fps = dm.emit_fps if dm else False
         self._action_freq = dm.action_freq if dm else None
         # Whether the above drop rolls actually fire. `make_dataset` flips this
         # off on the validation subset (unless `val_enable_optional_key_dropout`
diff --git a/src/opentau/envs/configs.py b/src/opentau/envs/configs.py
index 57bb09cd..cb0aef31 100644
--- a/src/opentau/envs/configs.py
+++ b/src/opentau/envs/configs.py
@@ -79,9 +79,10 @@ class EnvMetadataConfig:
         emit_fps: Whether to broadcast :attr:`EnvConfig.fps` as the ``fps``
             metadata field at inference (paralleling
             :attr:`DatasetMixtureConfig.emit_fps` at training time).
-            Defaults to ``True``. When ``False``, the policy prefix omits
-            the ``fps:`` segment — useful when resuming a checkpoint
-            trained without fps conditioning.
+            Defaults to ``False`` — fps conditioning is opt-in so old
+            checkpoints resume cleanly (no surprise ``FPS:`` segment in
+            the policy's metadata prefix). Flip to ``True`` for
+            checkpoints trained with the training-side ``emit_fps=True``.
     """
 
     speed: int | None = None
@@ -89,7 +90,7 @@ class EnvMetadataConfig:
     mistake: bool | None = None
     robot_type: str | None = None
     control_mode: ControlMode | None = None
-    emit_fps: bool = True
+    emit_fps: bool = False
 
     def __post_init__(self) -> None:
         # `isinstance(x, bool)` guards exclude Python bools — `bool` is a
diff --git a/tests/configs/test_default.py b/tests/configs/test_default.py
index ef544129..3f3063b1 100644
--- a/tests/configs/test_default.py
+++ b/tests/configs/test_default.py
@@ -99,16 +99,20 @@ def test_action_freq_positive_still_valid():
     assert cfg.action_freq == 30.0
 
 
-def test_emit_fps_defaults_to_true():
-    """The new `emit_fps` toggle defaults to True so the policy gets fps conditioning out of the box."""
+def test_emit_fps_defaults_to_false():
+    """The new `emit_fps` toggle is opt-in — pre-PR checkpoints resume cleanly
+    because the policy's metadata prefix doesn't gain an unfamiliar ``FPS:``
+    segment by default. New training runs that want per-sample fps
+    conditioning set `emit_fps=True` explicitly.
+    """
     cfg = DatasetMixtureConfig()
-    assert cfg.emit_fps is True
+    assert cfg.emit_fps is False
 
 
-def test_emit_fps_can_be_disabled():
-    """Setting `emit_fps=False` should be accepted (legacy-checkpoint resume escape hatch)."""
-    cfg = DatasetMixtureConfig(emit_fps=False)
-    assert cfg.emit_fps is False
+def test_emit_fps_can_be_enabled():
+    """Setting `emit_fps=True` opts in to per-sample fps tokenization."""
+    cfg = DatasetMixtureConfig(emit_fps=True)
+    assert cfg.emit_fps is True
 
 
 def test_invalid_image_resample_strategy_raises_error():
diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py
index 81747bfa..02ecde46 100644
--- a/tests/datasets/test_optional_keys.py
+++ b/tests/datasets/test_optional_keys.py
@@ -64,7 +64,7 @@ def __init__(
         response_drop_prob=0.0,
         metadata_drop_all_prob=0.0,
         metadata_drop_each_prob=0.0,
-        emit_fps=True,
+        emit_fps=False,
         action_freq: float | None = None,
         meta_info: dict | None = None,
         meta_fps: int = 30,
@@ -699,35 +699,44 @@ def test_emitted_as_python_strings_for_default_collate(self):
 
 
 # `fps` is emitted as an intrinsic dataset property — always present (non-pad)
-# when `emit_fps=True`, omitted entirely when `emit_fps=False`. Unlike speed /
-# quality / mistake / robot_type / control_mode it does NOT participate in any
-# of the `metadata_drop_*_prob` dropout rolls.
+# when `emit_fps=True`, omitted entirely when `emit_fps=False` (the default).
+# Unlike speed / quality / mistake / robot_type / control_mode it does NOT
+# participate in any of the `metadata_drop_*_prob` dropout rolls.
 
 
 class TestFpsEmission:
     def test_fps_emitted_with_meta_fps_value(self):
-        ds = _DummyBaseDataset(num_cams=1, meta_fps=20)
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=20)
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert out["fps"].dtype == torch.long
         assert out["fps"].item() == 20
         assert out["fps_is_pad"].item() is False
 
     def test_fps_emitted_as_torch_long_dtype(self):
-        ds = _DummyBaseDataset(num_cams=1, meta_fps=50)
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50)
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert out["fps"].dtype == torch.long
 
     def test_fps_omitted_when_emit_fps_false(self):
+        # The default — pre-PR behaviour preserved.
         ds = _DummyBaseDataset(num_cams=1, emit_fps=False, meta_fps=20)
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert "fps" not in out
         assert "fps_is_pad" not in out
 
+    def test_fps_omitted_by_default(self):
+        """The fixture default mirrors production: ``emit_fps=False`` ⇒ no fps keys."""
+        ds = _DummyBaseDataset(num_cams=1, meta_fps=20)
+        out = ds._to_standard_data_format(_full_raw_item(ds))
+        assert "fps" not in out
+        assert "fps_is_pad" not in out
+
     def test_fps_unaffected_by_metadata_drop_all_prob(self):
         # fps is intrinsic to the dataset, not a noisy label — it must NOT
         # participate in `metadata_drop_*_prob` rolls.
         ds = _DummyBaseDataset(
             num_cams=1,
+            emit_fps=True,
             meta_fps=15,
             metadata_drop_all_prob=1.0,
             metadata_drop_each_prob=1.0,
@@ -747,6 +756,7 @@ def test_fps_unaffected_by_disabled_dropout(self):
         # the same emit behavior — fps is gated by `emit_fps`, not by dropout.
         ds = _DummyBaseDataset(
             num_cams=1,
+            emit_fps=True,
             meta_fps=25,
             metadata_drop_all_prob=1.0,
             metadata_drop_each_prob=1.0,
@@ -762,8 +772,8 @@ def test_default_collate_stacks_mixed_fps(self):
         tensor and `fps_is_pad` becomes a (B,) bool tensor."""
         from torch.utils.data import default_collate
 
-        ds_a = _DummyBaseDataset(num_cams=1, meta_fps=30)
-        ds_b = _DummyBaseDataset(num_cams=1, meta_fps=50)
+        ds_a = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=30)
+        ds_b = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50)
         item_a = ds_a._to_standard_data_format(_full_raw_item(ds_a))
         item_b = ds_b._to_standard_data_format(_full_raw_item(ds_b))
         batch = default_collate([item_a, item_b])
@@ -779,7 +789,7 @@ def test_fps_reports_action_freq_when_resampling(self):
         different timing than the chunk actually carries.
         """
         # Dataset is natively 50 Hz, but the mixture pins everything to 30 Hz.
-        ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=30.0)
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50, action_freq=30.0)
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert out["fps"].item() == 30
         assert out["fps_is_pad"].item() is False
@@ -787,7 +797,7 @@ def test_fps_reports_action_freq_when_resampling(self):
     def test_fps_reports_native_when_action_freq_none(self):
         """When `action_freq is None` (no resampling), the chunk runs at the
         dataset's native rate — that's what gets emitted."""
-        ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=None)
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50, action_freq=None)
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert out["fps"].item() == 50
         assert out["fps_is_pad"].item() is False
@@ -797,8 +807,9 @@ def test_fps_reports_native_when_action_freq_none(self):
 # on the base class (and on ``VQADatasetMetadata``, which inherits via ``pass``).
 # `_emit_optional_keys` must catch that and emit ``fps=0, fps_is_pad=True``
 # instead of crashing on ``int(None)`` — otherwise any heterogeneous mixture
-# that includes a VQA dataset crashes at the first sample fetch under the
-# default ``emit_fps=True``.
+# with ``emit_fps=True`` crashes at the first VQA sample fetch. (Under the
+# default ``emit_fps=False`` the path is a no-op; these tests opt in to
+# exercise the heterogeneous-batch behaviour explicitly.)
 
 
 class TestFpsEmissionVQA:
@@ -819,7 +830,7 @@ def test_vqa_metadata_emits_pad_row(self):
         """
         from opentau.datasets.lerobot_dataset import VQADatasetMetadata
 
-        ds = _DummyBaseDataset(num_cams=1)
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=True)
         ds.meta = VQADatasetMetadata(info={"features": {}})
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert "fps" in out and "fps_is_pad" in out
@@ -833,7 +844,7 @@ def test_vqa_pad_ignores_action_freq(self):
         ``action_freq`` would be misleading."""
         from opentau.datasets.lerobot_dataset import VQADatasetMetadata
 
-        ds = _DummyBaseDataset(num_cams=1, action_freq=30.0)
+        ds = _DummyBaseDataset(num_cams=1, emit_fps=True, action_freq=30.0)
         ds.meta = VQADatasetMetadata(info={"features": {}})
         out = ds._to_standard_data_format(_full_raw_item(ds))
         assert out["fps"].item() == 0
@@ -850,11 +861,11 @@ def test_heterogeneous_vla_vqa_collate(self):
         from opentau.datasets.lerobot_dataset import VQADatasetMetadata
 
         # LeRobot-shaped sample.
-        ds_vla = _DummyBaseDataset(num_cams=1, meta_fps=30)
+        ds_vla = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=30)
         item_vla = ds_vla._to_standard_data_format(_full_raw_item(ds_vla))
 
         # VQA-shaped sample (same dummy class, real VQA metadata).
-        ds_vqa = _DummyBaseDataset(num_cams=1)
+        ds_vqa = _DummyBaseDataset(num_cams=1, emit_fps=True)
         ds_vqa.meta = VQADatasetMetadata(info={"features": {}})
         item_vqa = ds_vqa._to_standard_data_format(_full_raw_item(ds_vqa))
 
diff --git a/tests/envs/test_configs.py b/tests/envs/test_configs.py
index f475abd9..a7952e79 100644
--- a/tests/envs/test_configs.py
+++ b/tests/envs/test_configs.py
@@ -189,17 +189,18 @@ def test_all_none_default_passes(self):
         assert cfg.robot_type is None
         assert cfg.control_mode is None
         # `emit_fps` is a boolean toggle (not a None-presence field) and
-        # defaults to True — the policy receives an fps segment at eval
-        # unless the user explicitly opts out.
-        assert cfg.emit_fps is True
-
-    def test_emit_fps_can_be_disabled(self):
-        cfg = EnvMetadataConfig(emit_fps=False)
+        # defaults to False — fps conditioning is opt-in so pre-PR
+        # checkpoints resume cleanly without an unfamiliar ``FPS:``
+        # segment in the metadata prefix.
         assert cfg.emit_fps is False
 
+    def test_emit_fps_can_be_enabled(self):
+        cfg = EnvMetadataConfig(emit_fps=True)
+        assert cfg.emit_fps is True
+
     def test_emit_fps_kept_orthogonal_to_other_fields(self):
         """Setting ``emit_fps`` must not perturb the other fields' defaults."""
-        cfg = EnvMetadataConfig(emit_fps=False)
+        cfg = EnvMetadataConfig(emit_fps=True)
         assert cfg.speed is None
         assert cfg.quality is None
         assert cfg.mistake is None
diff --git a/tests/envs/test_utils.py b/tests/envs/test_utils.py
index ebb865a8..0bca2eb5 100644
--- a/tests/envs/test_utils.py
+++ b/tests/envs/test_utils.py
@@ -232,31 +232,31 @@ class TestAddEvalMetadata:
     silently changes the prefix the model sees at eval.
     """
 
-    def test_all_none_default_skips_user_keys_but_still_emits_fps(self):
+    def test_all_none_default_skips_every_key(self):
         """``EnvMetadataConfig`` defaults: every user-controlled field is
-        ``None`` *but* ``emit_fps`` defaults to ``True``, so the helper still
-        injects ``fps`` / ``fps_is_pad`` from ``cfg.env.fps``.
+        ``None`` and ``emit_fps`` defaults to ``False``, so the helper
+        injects **no** metadata keys at all. This preserves pre-PR
+        behaviour for any eval config that doesn't opt in.
         """
         obs = _make_obs()
         original_keys = set(obs.keys())
         out = add_eval_metadata(obs, cfg=_make_cfg())
         assert out is obs  # mutated-in-place contract
-        new_keys = set(out.keys()) - original_keys
-        assert new_keys == {"fps", "fps_is_pad"}, (
-            f"unexpected injected keys: {new_keys}; only fps/fps_is_pad should appear "
-            f"under the all-None metadata default"
+        assert set(out.keys()) == original_keys, (
+            f"unexpected injected keys under all-None metadata default: {set(out.keys()) - original_keys}"
         )
 
-    def test_emit_fps_false_skips_fps(self):
-        """``emit_fps=False`` on ``EnvMetadataConfig`` reverts the helper to the
-        legacy "skip all when nothing set" behaviour — useful when resuming a
-        checkpoint trained without fps conditioning.
+    def test_emit_fps_true_injects_fps(self):
+        """``emit_fps=True`` opts in to broadcasting ``cfg.env.fps`` as a
+        ``(B,)`` torch.long tensor with ``fps_is_pad=False``.
         """
         obs = _make_obs()
         original_keys = set(obs.keys())
-        out = add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False))
-        assert set(out.keys()) == original_keys, (
-            "no keys should be injected when emit_fps=False and every metadata field is None"
+        out = add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True))
+        new_keys = set(out.keys()) - original_keys
+        assert new_keys == {"fps", "fps_is_pad"}, (
+            f"unexpected injected keys: {new_keys}; emit_fps=True with no other "
+            f"metadata fields should inject only fps/fps_is_pad"
         )
 
     @pytest.mark.parametrize("batch_size", [1, 4])
@@ -317,10 +317,10 @@ def test_string_fields_broadcast_as_list(self, key, value):
         assert f"{key}_is_pad" not in obs, "string fields use empty-string as pad signal, not a flag"
 
     def test_partial_fields_only_inject_specified_keys(self):
-        # `emit_fps=False` so this test isolates the user-set fields from the
-        # default fps injection (covered separately in test_emit_fps_*).
+        # `emit_fps` defaults to False, so this exercises the user-set fields
+        # in isolation from the fps emission path (covered separately).
         obs = _make_obs()
-        add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5"))
+        add_eval_metadata(obs, cfg=_make_cfg(speed=30, robot_type="UR5"))
 
         assert "speed" in obs and "speed_is_pad" in obs
         assert "robot_type" in obs
@@ -340,26 +340,25 @@ def test_device_propagation(self):
 
         Using "meta" device makes this CPU-host portable (no real CUDA
         required) while still exercising the device-routing branch in the
-        helper.
+        helper. Opts in to ``emit_fps=True`` so the fps device-routing
+        branch is also exercised.
         """
         obs = _make_obs(batch_size=2, device="meta")
-        add_eval_metadata(obs, cfg=_make_cfg(speed=20, mistake=True))
+        add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True, speed=20, mistake=True))
 
         assert obs["speed"].device.type == "meta"
         assert obs["speed_is_pad"].device.type == "meta"
         assert obs["mistake"].device.type == "meta"
         assert obs["mistake_is_pad"].device.type == "meta"
-        # fps gets injected too under the default `emit_fps=True`; it must
-        # share the same device as state.
         assert obs["fps"].device.type == "meta"
         assert obs["fps_is_pad"].device.type == "meta"
 
     @pytest.mark.parametrize("batch_size", [1, 4])
     def test_fps_injects_long_tensor_from_env_fps(self, batch_size):
-        """``emit_fps=True`` (default) → broadcast ``cfg.env.fps`` as a
+        """``emit_fps=True`` opt-in → broadcast ``cfg.env.fps`` as a
         ``(B,)`` torch.long tensor with ``fps_is_pad=False`` rows."""
         obs = _make_obs(batch_size=batch_size)
-        add_eval_metadata(obs, cfg=_make_cfg(fps=20))  # LiberoEnv-style fps
+        add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True, fps=20))  # LiberoEnv-style fps
 
         assert obs["fps"].dtype == torch.long
         assert obs["fps"].shape == (batch_size,)
@@ -373,13 +372,14 @@ def test_fps_value_comes_from_env_not_metadata(self):
         """The fps value source is ``cfg.env.fps`` (env stepping freq), NOT a
         field on ``EnvMetadataConfig`` (which only carries the on/off toggle)."""
         obs = _make_obs(batch_size=2)
-        add_eval_metadata(obs, cfg=_make_cfg(fps=50))
+        add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True, fps=50))
 
         assert torch.equal(obs["fps"], torch.full((2,), 50, dtype=torch.long))
 
     def test_fps_disabled_skips_only_fps_keys(self):
-        """``emit_fps=False`` skips the fps injection but leaves other set
-        fields alone — they go through the regular None-presence gating."""
+        """``emit_fps=False`` (the default) skips the fps injection but leaves
+        other set fields alone — they go through the regular None-presence
+        gating. The explicit ``emit_fps=False`` documents intent."""
         obs = _make_obs(batch_size=2)
         add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5"))