TensorAuto · shuheng-liu · May 23, 2026 · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst
@@ -28,7 +28,7 @@ This class:
 *   Combines multiple ``LeRobotDataset`` and ``VQADataset`` instances.
 *   Different weights can be assigned to each dataset to control the sampling frequency; if weights are omitted (or set to ``null`` in JSON), weights default to dataset lengths.
 *   Aggregates statistics from all constituent datasets to ensure consistent normalization across the mixture.
-*   Resamples the action output frequency to match the action frequency specified in the configuration.
+*   Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). When running mixed-frequency training, opt in to ``DatasetMixtureConfig.emit_fps=True`` so the per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) and the policy can condition on it.
 
 Metadata
 ^^^^^^^^
@@ -130,6 +130,34 @@ Like ``speed``, ``mistake``, and ``quality``, they participate in the
 ``metadata_drop_all_prob`` / ``metadata_drop_each_prob`` dropout rolls —
 see :ref:`Training-time dropout <standard-data-format-optional-keys-dropout>`.
 
+``fps`` is the **effective per-sample frame rate** of the (possibly
+resampled) action chunk: ``DatasetMixtureConfig.action_freq`` when set,
+otherwise the dataset's native ``meta.fps``. The tokenized ``FPS: N,``
+segment adds ~3-4 BPE tokens to the metadata prefix, which still fits
+comfortably inside the default ``metadata_max_length=52`` — but long
+``robot_type`` strings combined with a fully-populated metadata batch
+leave less headroom than before, and the underlying tokenizer call uses
+``truncation=True`` silently. Bump ``metadata_max_length`` (a field on
+each pi07 / pi07_paligemma config) if you start seeing the trailing
+``Control:`` segment get clipped. Heterogeneous-frequency mixtures
+(``action_freq=None``) need the ``fps`` key so the policy can condition on
+each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different
+real-time horizons even when both are ``chunk_size`` frames long. Unlike
+the other metadata fields, ``fps`` does **not** participate in the
+dropout rolls — it's an intrinsic property of the chunk, not a noisy
+label, so it's always non-pad when emitted from a dataset that has a
+real frame rate. Samples from VQA datasets (no temporal axis) emit
+``fps=0, fps_is_pad=True`` so heterogeneous VLA + VQA mixtures stay
+schema-aligned across the batch; the policy's ``prepare_metadata``
+then drops the ``FPS:`` segment for those rows. Emission is gated by
+``DatasetMixtureConfig.emit_fps`` (default ``False`` — pre-PR
+checkpoints resume cleanly because the policy's metadata prefix doesn't
+gain an unfamiliar ``FPS:`` segment). Flip to ``True`` for new training
+runs that want per-sample fps conditioning, especially heterogeneous
+mixtures where ``action_freq=None`` lets each dataset run at its native
+rate. At inference, ``EnvMetadataConfig.emit_fps`` (same default
+``False``) gates the eval-side broadcast of ``cfg.env.fps``.
+
 .. code-block:: python
 
     {
@@ -182,6 +210,20 @@ see :ref:`Training-time dropout <standard-data-format-optional-keys-dropout>`.
         "quality": torch.LongTensor,   # Scalar in {1,2,3,4,5}; episode-level quality score.
         "quality_is_pad": torch.BoolTensor,
 
+        "fps": torch.LongTensor,       # Scalar; effective per-sample frame rate of the action chunk.
+                                       # When `DatasetMixtureConfig.action_freq` is set, every dataset is
+                                       # resampled to that rate (via `resolve_delta_timestamps`) and `fps`
+                                       # reports `int(action_freq)` (a fractional rate is truncated). When
+                                       # `action_freq is None` (the default), the chunk runs at the dataset's
+                                       # native `meta.fps`. Gated by `DatasetMixtureConfig.emit_fps` (default
+                                       # `False` — opt-in); the key is omitted entirely when `emit_fps=False`.
+                                       # Does NOT participate in `metadata_drop_*_prob` — fps is an intrinsic
+                                       # property of the chunk, not a noisy label.
+        "fps_is_pad": torch.BoolTensor,    # Always False when emitted from a real-rate sample. VQA samples
+                                           # (no temporal axis) emit `fps_is_pad=True` so heterogeneous
+                                           # VLA + VQA batches stay schema-aligned; `prepare_metadata`
+                                           # then drops the `FPS:` segment for those rows.
+
         "subgoal0": torch.Tensor,       # shape (3, H, W), values in [0,1]. A single future frame from
                                         # camera0 sampled either at end-of-segment (with probability
                                         # `subgoal_end_of_segment_prob`) or uniformly in [t, t+4 seconds].
@@ -257,6 +299,16 @@ for reproducibility.
      - Per-field independent mask roll for each of ``speed``,
        ``mistake``, ``quality``, ``robot_type``, ``control_mode``.
        Only rolled when the shared drop did not fire.
+
+       .. note::
+
+          ``fps`` is **not** in either drop pool — when ``emit_fps=True`` it
+          stays non-pad for every LeRobot sample regardless of the rolls. This
+          means under ``emit_fps=True``, ``metadata_drop_all_prob=1.0`` produces
+          an "fps-only metadata segment" rather than "no metadata segment at
+          all" (the policy's ``has_metadata`` branch sees a non-empty metadata
+          mask and keeps the ``Metadata: FPS: N,`` block in the prefix). For a
+          true no-metadata ablation, keep the default ``emit_fps=False``.
    * - ``val_enable_optional_key_dropout``
      - ``False``
      - Whether the five drop rolls above also fire on the **validation**

diff --git a/src/opentau/configs/default.py b/src/opentau/configs/default.py
@@ -174,7 +174,14 @@ class DatasetMixtureConfig:
             same length as `datasets` when provided. If None, weights are inferred
             from dataset lengths. Defaults to None.
         action_freq: Frequency at which actions from the dataset mixture are
-            resampled, in Hz. Defaults to 30.0.
+            resampled, in Hz. ``None`` (default) disables resampling — each
+            dataset is sampled at its native fps, so a single batch can mix
+            samples from sources running at different rates (predicting
+            ``chunk_size`` consecutive native frames per sample). Set a
+            positive float to resample every dataset in the mixture to that
+            common rate via nearest-neighbor frame selection. When using
+            ``None``, prefer also setting ``emit_fps=True`` so the policy
+            can condition on the per-sample rate.
         image_resample_strategy: Resample strategy for image features. Must be
             one of 'linear' or 'nearest'. Defaults to 'nearest'.
         vector_resample_strategy: Resample strategy for non-image features, such
@@ -227,6 +234,23 @@ class DatasetMixtureConfig:
             must have a non-empty ``control_mode`` after the optional
             ``DatasetConfig.control_mode`` override has been applied. Defaults
             to ``False`` (empty / missing values are allowed).
+        emit_fps: Whether ``__getitem__`` returns the *effective*
+            per-sample frame rate (``action_freq`` if set, else the
+            dataset's native ``meta.fps``) as the ``fps`` metadata key
+            (``torch.long`` scalar, paired with ``fps_is_pad=False``).
+            Default ``False`` — fps conditioning is an opt-in feature so
+            pre-PR checkpoints resume without the policy's metadata
+            prefix gaining an unfamiliar ``FPS:`` segment. Flip to
+            ``True`` for new training runs that want per-sample
+            frame-rate conditioning (especially heterogeneous-frequency
+            mixtures where ``action_freq=None`` lets each dataset run at
+            its native rate). Unlike the other metadata fields, ``fps``
+            is **not** rolled by ``metadata_drop_*_prob`` — it's an
+            intrinsic property of the chunk, not a noisy label, so it
+            is always present (non-pad) for LeRobot samples when
+            ``emit_fps=True``. VQA samples (no temporal axis) emit
+            ``fps=0, fps_is_pad=True`` regardless so heterogeneous
+            VLA + VQA batches stay schema-aligned.
         tolerance_s: Mixture-wide default tolerance (in seconds) for the
             load-time ``check_timestamps_sync`` call inside
             ``LeRobotDataset.__init__``. Each dataset's frame-to-frame
@@ -254,9 +278,9 @@ class DatasetMixtureConfig:
 
     Raises:
         ValueError: If `weights` is provided and its length doesn't match
-            `datasets`, if `action_freq` is not positive, if resample
-            strategies are invalid, or if any drop probability is outside
-            ``[0, 1]``.
+            `datasets`, if `action_freq` is not None and not positive, if
+            resample strategies are invalid, or if any drop probability is
+            outside ``[0, 1]``.
     """
 
     # List of dataset configs to be used in the mixture.
@@ -265,7 +289,8 @@ class DatasetMixtureConfig:
     # Must be the same length as `datasets` when provided.
     weights: list[float] | None = None
     # Frequency at which the actions from dataset mixture are resampled, in Hz.
-    action_freq: float = 30.0
+    # ``None`` disables resampling — each dataset is sampled at its native fps.
+    action_freq: float | None = None
     # Resample strategy for image features
     image_resample_strategy: str = "nearest"
     # Resample strategy for non-image features, such as action or state
@@ -297,6 +322,16 @@ class DatasetMixtureConfig:
     require_non_empty_robot_type: bool = False
     require_non_empty_control_mode: bool = False
 
+    # Whether `__getitem__` emits the effective per-sample fps as the `fps`
+    # metadata key. Default `False` so pre-PR checkpoints resume cleanly
+    # (no new `FPS:` segment in the policy's metadata prefix). Flip to
+    # `True` for new training runs that want per-sample fps conditioning;
+    # especially relevant for heterogeneous-frequency mixtures
+    # (`action_freq=None`). Independent of `metadata_drop_*_prob` — fps
+    # is intrinsic to the chunk, not a noisy label, so it is always
+    # present (never padded) for LeRobot samples when this is True.
+    emit_fps: bool = False
+
     # Mixture-wide defaults for the load-time timestamp-sync check. Each
     # dataset can override these via `DatasetConfig.{tolerance_s,
     # skip_timestamp_check}`. The default tolerance matches
@@ -308,8 +343,8 @@ def __post_init__(self):
         """Validate dataset mixture configuration."""
         if self.weights is not None and len(self.datasets) != len(self.weights):
             raise ValueError("The length of `weights` must match the length of `datasets`.")
-        if self.action_freq <= 0:
-            raise ValueError(f"`action_freq` must be a positive number, got {self.action_freq}.")
+        if self.action_freq is not None and self.action_freq <= 0:
+            raise ValueError(f"`action_freq` must be a positive number or None, got {self.action_freq}.")
         if self.image_resample_strategy not in ["linear", "nearest"]:
             raise ValueError(
                 f"`image_resample_strategy` must be one of ['linear', 'nearest'], got {self.image_resample_strategy}."

diff --git a/src/opentau/datasets/factory.py b/src/opentau/datasets/factory.py
@@ -131,6 +131,12 @@ def resolve_delta_timestamps(
     """
     delta_timestamps: dict[str, list[float]] = {}
     action_freq = cfg.dataset_mixture.action_freq
+    # Mixed-frequency training: `action_freq=None` opts out of resampling.
+    # Substituting `ds_meta.fps` makes every delta-timestamp land exactly on
+    # this dataset's native frame boundaries, so nearest-neighbor sampling is
+    # a no-op and consecutive frames are returned unchanged.
+    if action_freq is None:
+        action_freq = ds_meta.fps
 
     if dataset_cfg.repo_id is None:
         raise ValueError("dataset_cfg.repo_id must not be None when resolving delta timestamps.")

diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py
@@ -301,9 +301,28 @@ def shapes(self) -> dict:
         """Shapes for the different features."""
         return {key: tuple(ft["shape"]) for key, ft in self.features.items()}
 
+    @property
+    def fps(self) -> int | None:
+        """Native frame rate of the dataset.
+
+        Returns ``None`` for datasets without a temporal axis (e.g. VQA
+        image-text datasets). Subclasses with real frame-rate metadata
+        (``LeRobotDatasetMetadata``) override this to return the int from
+        ``info["fps"]``. Downstream callers that emit fps as a sample key
+        treat ``None`` as the pad signal so heterogeneous mixtures
+        (VLA + VQA) stay batchable.
+        """
+        return None
+
 
 class VQADatasetMetadata(DatasetMetadata):
-    """Metadata class for vqa datasets (vision-language datasets)."""
+    """Metadata class for vqa datasets (vision-language datasets).
+
+    Inherits ``fps -> None`` from :class:`DatasetMetadata` since VQA
+    samples have no temporal axis. :meth:`BaseDataset._emit_optional_keys`
+    sees this and emits ``fps_is_pad=True`` for VQA samples in a mixture,
+    keeping every batch row schema-aligned with the LeRobot samples.
+    """
 
     pass
 
@@ -698,6 +717,15 @@ def __init__(self, cfg: TrainPipelineConfig):
         self.response_drop_prob = dm.response_drop_prob if dm else 0.0
         self.metadata_drop_all_prob = dm.metadata_drop_all_prob if dm else 0.0
         self.metadata_drop_each_prob = dm.metadata_drop_each_prob if dm else 0.0
+        # `_emit_optional_keys` emits the *effective* per-sample fps when this
+        # is True: the mixture's `action_freq` if set (every dataset has been
+        # resampled to that rate via `resolve_delta_timestamps`), else the
+        # dataset's native `meta.fps`. Independent of the dropout rolls above.
+        # Default-False fallback matches `DatasetMixtureConfig.emit_fps` so
+        # the "no mixture config" path (VQA-only / unit tests) behaves the
+        # same as the explicit-mixture default.
+        self.emit_fps = dm.emit_fps if dm else False
+        self._action_freq = dm.action_freq if dm else None
         # Whether the above drop rolls actually fire. `make_dataset` flips this
         # off on the validation subset (unless `val_enable_optional_key_dropout`
         # is set). Subgoal *frame* selection (end-of-segment vs. uniform window)
@@ -875,6 +903,20 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None:
         pad signal — a consumer seeing ``""`` can assume the field was
         unavailable or was masked this step.
 
+        ``fps`` (the *effective* per-sample frame rate — ``action_freq`` if
+        set on the mixture, else the dataset's native ``meta.fps``) is
+        emitted as a ``torch.long`` scalar alongside ``fps_is_pad`` when
+        ``self.emit_fps`` is True. Unlike the other metadata fields,
+        ``fps`` does **not** participate in the dropout rolls — it's an
+        intrinsic property of the (possibly resampled) chunk, not a noisy
+        label — so it's always non-padded for samples that have a real
+        frame rate. Samples from VQA datasets (where
+        :attr:`DatasetMetadata.fps` is ``None``) emit
+        ``fps=0, fps_is_pad=True`` so a heterogeneous VLA + VQA mixture
+        stays schema-aligned across the batch. Set ``emit_fps=False`` on
+        the mixture to omit the keys entirely (the policy's
+        ``prepare_metadata`` falls through to its default-pad path).
+
         Dropout order:
             1. ``history_state_drop_prob``: zero ``state`` and historical camera
                frames; mark ``obs_history_is_pad`` all True.
@@ -885,6 +927,7 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None:
             4. ``metadata_drop_all_prob``: mask {speed, mistake, quality,
                robot_type, control_mode} together. If this didn't fire,
                ``metadata_drop_each_prob`` rolls independently for each field.
+               ``fps`` is **not** included in these rolls.
 
         Dropout rolls use the default torch RNG (auto-seeded per worker).
 
@@ -975,6 +1018,27 @@ def _roll(prob: float) -> bool:
             drop_this = drop_meta_all or _roll(self.metadata_drop_each_prob)
             standard_item[key] = "" if drop_this else val
 
+        # (6) Effective fps: the rate the action chunk actually runs at after
+        # any mixture-level resampling. When `action_freq` is set, every
+        # dataset's chunks are nearest-neighbor resampled to that rate by
+        # `resolve_delta_timestamps`, so tokenizing the dataset's native
+        # `meta.fps` would mislead the policy (the chunk timing differs).
+        # When `action_freq is None` (no resampling), the chunk runs at the
+        # dataset's native rate. Datasets without a temporal axis (VQA;
+        # `DatasetMetadata.fps` returns ``None``) emit pad so heterogeneous
+        # VLA + VQA mixtures stay schema-aligned across the batch. Always
+        # non-pad (no dropout) for real-rate samples when emit_fps is
+        # enabled on the mixture.
+        if self.emit_fps:
+            native_fps = self.meta.fps
+            if native_fps is None:
+                standard_item["fps"] = torch.tensor(0, dtype=torch.long)
+                standard_item["fps_is_pad"] = torch.tensor(True)
+            else:
+                effective_fps = int(self._action_freq) if self._action_freq is not None else int(native_fps)
+                standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long)
+                standard_item["fps_is_pad"] = torch.tensor(False)
+
     def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor:
         """Resize an image to target dimensions with padding.
 

diff --git a/src/opentau/envs/configs.py b/src/opentau/envs/configs.py
@@ -76,13 +76,21 @@ class EnvMetadataConfig:
             ``None``.
         control_mode: ``"joint"`` (joint-position control) or ``"ee"``
             (end-effector control), or ``None``.
+        emit_fps: Whether to broadcast :attr:`EnvConfig.fps` as the ``fps``
+            metadata field at inference (paralleling
+            :attr:`DatasetMixtureConfig.emit_fps` at training time).
+            Defaults to ``False`` — fps conditioning is opt-in so old
+            checkpoints resume cleanly (no surprise ``FPS:`` segment in
+            the policy's metadata prefix). Flip to ``True`` for
+            checkpoints trained with the training-side ``emit_fps=True``.
     """
 
     speed: int | None = None
     quality: int | None = None
     mistake: bool | None = None
     robot_type: str | None = None
     control_mode: ControlMode | None = None
+    emit_fps: bool = False
 
     def __post_init__(self) -> None:
         # `isinstance(x, bool)` guards exclude Python bools — `bool` is a