From 718de1dd5ff9e6c8880641bdb5fc0d727d61e465 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 19:17:10 -0700 Subject: [PATCH 1/7] feat(datasets): mixed-frequency training + fps metadata for pi07 - DatasetMixtureConfig.action_freq is now Optional[float] and defaults to None, disabling resampling so each dataset is sampled at its native fps. resolve_delta_timestamps substitutes ds_meta.fps when action_freq is None so a chunk of N actions is N consecutive native frames regardless of the dataset's rate. - New DatasetMixtureConfig.emit_fps (default True) makes __getitem__ emit the dataset's native meta.fps as a torch.long sample key (always non-pad, no participation in metadata_drop_*_prob). - All four pi07 / pi07_paligemma prepare_metadata implementations now tokenize fps with the lowercase "fps: N, " header between Robot: and Control:, omitted entirely when fps_is_pad is True. - New EnvMetadataConfig.emit_fps (default True) makes add_eval_metadata broadcast cfg.env.fps as the inference-time fps metadata key, matching the training contract end-to-end. --- src/opentau/configs/default.py | 37 ++++- src/opentau/datasets/factory.py | 6 + src/opentau/datasets/lerobot_dataset.py | 20 +++ src/opentau/envs/configs.py | 7 + src/opentau/envs/utils.py | 18 ++- .../modeling_pi07_high_level.py | 19 ++- .../pi07/low_level/modeling_pi07_low_level.py | 19 ++- .../modeling_pi07_high_level.py | 14 +- .../low_level/modeling_pi07_low_level.py | 62 ++++++-- tests/configs/test_default.py | 35 ++++- tests/datasets/test_delta_timestamps.py | 76 ++++++++++ tests/datasets/test_optional_keys.py | 84 ++++++++++- tests/envs/test_configs.py | 17 +++ tests/envs/test_utils.py | 91 ++++++++++- tests/policies/test_pi07_cpu.py | 142 ++++++++++++++++++ .../test_pi07_paligemma_high_level_planner.py | 100 ++++++++++++ .../policies/test_pi07_paligemma_low_level.py | 127 ++++++++++++++++ 17 files changed, 828 insertions(+), 46 deletions(-) diff --git a/src/opentau/configs/default.py b/src/opentau/configs/default.py index d835f8d9..19b723ae 100644 --- a/src/opentau/configs/default.py +++ b/src/opentau/configs/default.py @@ -174,7 +174,14 @@ class DatasetMixtureConfig: same length as `datasets` when provided. If None, weights are inferred from dataset lengths. Defaults to None. action_freq: Frequency at which actions from the dataset mixture are - resampled, in Hz. Defaults to 30.0. + resampled, in Hz. ``None`` (default) disables resampling — each + dataset is sampled at its native fps, so a single batch can mix + samples from sources running at different rates (predicting + ``chunk_size`` consecutive native frames per sample). Set a + positive float to resample every dataset in the mixture to that + common rate via nearest-neighbor frame selection. When using + ``None``, prefer also setting ``emit_fps=True`` so the policy + can condition on the per-sample rate. image_resample_strategy: Resample strategy for image features. Must be one of 'linear' or 'nearest'. Defaults to 'nearest'. vector_resample_strategy: Resample strategy for non-image features, such @@ -227,6 +234,15 @@ class DatasetMixtureConfig: must have a non-empty ``control_mode`` after the optional ``DatasetConfig.control_mode`` override has been applied. Defaults to ``False`` (empty / missing values are allowed). + emit_fps: Whether ``__getitem__`` returns the dataset's native + ``meta.fps`` as the ``fps`` metadata key (``torch.long`` scalar, + paired with ``fps_is_pad=False``). Default ``True``. When + ``False``, neither ``fps`` nor ``fps_is_pad`` appears in the + sample dict and the pi07 / pi07_paligemma policy prefix omits + the ``fps:`` segment. Unlike the other metadata fields, ``fps`` + is **not** rolled by ``metadata_drop_*_prob`` — it's an intrinsic + property of the dataset, not a noisy label, so it is always + present when ``emit_fps=True``. tolerance_s: Mixture-wide default tolerance (in seconds) for the load-time ``check_timestamps_sync`` call inside ``LeRobotDataset.__init__``. Each dataset's frame-to-frame @@ -254,9 +270,9 @@ class DatasetMixtureConfig: Raises: ValueError: If `weights` is provided and its length doesn't match - `datasets`, if `action_freq` is not positive, if resample - strategies are invalid, or if any drop probability is outside - ``[0, 1]``. + `datasets`, if `action_freq` is not None and not positive, if + resample strategies are invalid, or if any drop probability is + outside ``[0, 1]``. """ # List of dataset configs to be used in the mixture. @@ -265,7 +281,8 @@ class DatasetMixtureConfig: # Must be the same length as `datasets` when provided. weights: list[float] | None = None # Frequency at which the actions from dataset mixture are resampled, in Hz. - action_freq: float = 30.0 + # ``None`` disables resampling — each dataset is sampled at its native fps. + action_freq: float | None = None # Resample strategy for image features image_resample_strategy: str = "nearest" # Resample strategy for non-image features, such as action or state @@ -297,6 +314,12 @@ class DatasetMixtureConfig: require_non_empty_robot_type: bool = False require_non_empty_control_mode: bool = False + # Whether `__getitem__` emits the dataset's native fps as the `fps` + # metadata key. Independent of `metadata_drop_*_prob` — fps is intrinsic + # to the dataset, not a noisy label, so it is always present (never + # padded) when this is True. + emit_fps: bool = True + # Mixture-wide defaults for the load-time timestamp-sync check. Each # dataset can override these via `DatasetConfig.{tolerance_s, # skip_timestamp_check}`. The default tolerance matches @@ -308,8 +331,8 @@ def __post_init__(self): """Validate dataset mixture configuration.""" if self.weights is not None and len(self.datasets) != len(self.weights): raise ValueError("The length of `weights` must match the length of `datasets`.") - if self.action_freq <= 0: - raise ValueError(f"`action_freq` must be a positive number, got {self.action_freq}.") + if self.action_freq is not None and self.action_freq <= 0: + raise ValueError(f"`action_freq` must be a positive number or None, got {self.action_freq}.") if self.image_resample_strategy not in ["linear", "nearest"]: raise ValueError( f"`image_resample_strategy` must be one of ['linear', 'nearest'], got {self.image_resample_strategy}." diff --git a/src/opentau/datasets/factory.py b/src/opentau/datasets/factory.py index 3cfbddf4..33d42583 100644 --- a/src/opentau/datasets/factory.py +++ b/src/opentau/datasets/factory.py @@ -131,6 +131,12 @@ def resolve_delta_timestamps( """ delta_timestamps: dict[str, list[float]] = {} action_freq = cfg.dataset_mixture.action_freq + # Mixed-frequency training: `action_freq=None` opts out of resampling. + # Substituting `ds_meta.fps` makes every delta-timestamp land exactly on + # this dataset's native frame boundaries, so nearest-neighbor sampling is + # a no-op and consecutive frames are returned unchanged. + if action_freq is None: + action_freq = ds_meta.fps if dataset_cfg.repo_id is None: raise ValueError("dataset_cfg.repo_id must not be None when resolving delta timestamps.") diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py index 2abf6c14..9b3303ae 100644 --- a/src/opentau/datasets/lerobot_dataset.py +++ b/src/opentau/datasets/lerobot_dataset.py @@ -698,6 +698,10 @@ def __init__(self, cfg: TrainPipelineConfig): self.response_drop_prob = dm.response_drop_prob if dm else 0.0 self.metadata_drop_all_prob = dm.metadata_drop_all_prob if dm else 0.0 self.metadata_drop_each_prob = dm.metadata_drop_each_prob if dm else 0.0 + # Whether `_emit_optional_keys` injects the dataset's native `meta.fps` + # under the `fps` / `fps_is_pad` keys. Independent of the dropout rolls + # above — fps is intrinsic to the dataset, not a noisy label. + self.emit_fps = dm.emit_fps if dm else True # Whether the above drop rolls actually fire. `make_dataset` flips this # off on the validation subset (unless `val_enable_optional_key_dropout` # is set). Subgoal *frame* selection (end-of-segment vs. uniform window) @@ -875,6 +879,15 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None: pad signal — a consumer seeing ``""`` can assume the field was unavailable or was masked this step. + ``fps`` (the dataset's native frame rate) is emitted as a + ``torch.long`` scalar alongside ``fps_is_pad`` when + ``self.emit_fps`` is True. Unlike the other metadata fields, ``fps`` + does **not** participate in the dropout rolls — it's an intrinsic + property of the dataset, not a noisy label — so it's always + non-padded when emitted. Set ``emit_fps=False`` on the mixture to + omit the keys entirely (the policy's ``prepare_metadata`` falls + through to its default-pad path). + Dropout order: 1. ``history_state_drop_prob``: zero ``state`` and historical camera frames; mark ``obs_history_is_pad`` all True. @@ -885,6 +898,7 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None: 4. ``metadata_drop_all_prob``: mask {speed, mistake, quality, robot_type, control_mode} together. If this didn't fire, ``metadata_drop_each_prob`` rolls independently for each field. + ``fps`` is **not** included in these rolls. Dropout rolls use the default torch RNG (auto-seeded per worker). @@ -975,6 +989,12 @@ def _roll(prob: float) -> bool: drop_this = drop_meta_all or _roll(self.metadata_drop_each_prob) standard_item[key] = "" if drop_this else val + # (6) Native fps: intrinsic dataset property, not a noisy label. Emitted + # unconditionally (no dropout) when `emit_fps` is enabled on the mixture. + if self.emit_fps: + standard_item["fps"] = torch.tensor(int(self.meta.fps), dtype=torch.long) + standard_item["fps_is_pad"] = torch.tensor(False) + def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor: """Resize an image to target dimensions with padding. diff --git a/src/opentau/envs/configs.py b/src/opentau/envs/configs.py index b47b246c..57bb09cd 100644 --- a/src/opentau/envs/configs.py +++ b/src/opentau/envs/configs.py @@ -76,6 +76,12 @@ class EnvMetadataConfig: ``None``. control_mode: ``"joint"`` (joint-position control) or ``"ee"`` (end-effector control), or ``None``. + emit_fps: Whether to broadcast :attr:`EnvConfig.fps` as the ``fps`` + metadata field at inference (paralleling + :attr:`DatasetMixtureConfig.emit_fps` at training time). + Defaults to ``True``. When ``False``, the policy prefix omits + the ``fps:`` segment — useful when resuming a checkpoint + trained without fps conditioning. """ speed: int | None = None @@ -83,6 +89,7 @@ class EnvMetadataConfig: mistake: bool | None = None robot_type: str | None = None control_mode: ControlMode | None = None + emit_fps: bool = True def __post_init__(self) -> None: # `isinstance(x, bool)` guards exclude Python bools — `bool` is a diff --git a/src/opentau/envs/utils.py b/src/opentau/envs/utils.py index 387c2187..2af6c749 100644 --- a/src/opentau/envs/utils.py +++ b/src/opentau/envs/utils.py @@ -172,13 +172,23 @@ def add_eval_metadata(observation: dict[str, Any], cfg: TrainPipelineConfig) -> broadcast as ``list[str]`` of length ``num_envs``. Fields set to ``None`` on the config are skipped, so the corresponding batch key stays absent and the policy's ``prepare_metadata`` pad default kicks - in. ``cfg.env`` is guaranteed non-``None`` here — ``eval_policy`` + in. + + ``fps`` is the env's stepping frequency (:attr:`EnvConfig.fps`, + e.g. ``20`` for LIBERO) and is broadcast as a ``torch.long`` tensor + with ``fps_is_pad=False`` whenever ``meta.emit_fps`` is ``True`` + (the default). Set ``emit_fps=False`` on ``EnvMetadataConfig`` to + skip it — useful when resuming a checkpoint trained without fps + conditioning. + + ``cfg.env`` is guaranteed non-``None`` here — ``eval_policy`` dereferences ``cfg.env.type`` before the rollout loop starts. Args: observation: Observation dict produced by ``preprocess_observation`` (must contain ``state`` to source the batch size and device). - cfg: Training/eval pipeline config; reads ``cfg.env.metadata``. + cfg: Training/eval pipeline config; reads ``cfg.env.metadata`` and + ``cfg.env.fps``. Returns: The observation dict, mutated in place. @@ -203,6 +213,10 @@ def add_eval_metadata(observation: dict[str, Any], cfg: TrainPipelineConfig) -> if val is not None: observation[key] = [val] * batch_size + if meta.emit_fps: + observation["fps"] = torch.tensor([cfg.env.fps] * batch_size, dtype=torch.long, device=device) + observation["fps_is_pad"] = torch.zeros(batch_size, dtype=torch.bool, device=device) + return observation diff --git a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py index b15eee19..d320bce6 100644 --- a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py +++ b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py @@ -688,12 +688,12 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: Args: batch: Batch dict that may contain any of: - ``"speed"``, ``"quality"``, ``"mistake"`` (float tensors with - a corresponding ``_is_pad`` bool tensor — entries marked as - pad are dropped), and ``"robot_type"``, ``"control_mode"`` - (lists of strings — empty string is the pad signal, no - separate ``_is_pad`` flag). Missing keys are treated as - fully padded. + ``"speed"``, ``"quality"``, ``"mistake"``, ``"fps"`` (numeric + tensors with a corresponding ``_is_pad`` bool tensor — + entries marked as pad are dropped), and ``"robot_type"``, + ``"control_mode"`` (lists of strings — empty string is the + pad signal, no separate ``_is_pad`` flag). Missing keys are + treated as fully padded. Returns: A tuple ``(metadata_tokens, metadata_masks)`` with shapes @@ -711,6 +711,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: mistake_is_pad, robot_type, control_mode, + fps, + fps_is_pad, ) in zip( batch.get("speed", torch.zeros(batch_size, dtype=torch.float32)), batch.get("quality", torch.zeros(batch_size, dtype=torch.float32)), @@ -720,6 +722,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: batch.get("mistake_is_pad", torch.ones(batch_size, dtype=torch.bool)), batch.get("robot_type", [""] * batch_size), batch.get("control_mode", [""] * batch_size), + batch.get("fps", torch.zeros(batch_size, dtype=torch.long)), + batch.get("fps_is_pad", torch.ones(batch_size, dtype=torch.bool)), strict=True, ): segments = [] @@ -735,6 +739,9 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: if robot_type: segments.append(f"Robot: {robot_type}, ") + if not fps_is_pad: + segments.append(f"fps: {str(fps.item())}, ") + if control_mode: segments.append(f"Control: {control_mode}, ") diff --git a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py index 58bbc2dd..6ca13d3c 100644 --- a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py +++ b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py @@ -1087,12 +1087,12 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: Args: batch: Batch dict that may contain any of: - ``"speed"``, ``"quality"``, ``"mistake"`` (float tensors with - a corresponding ``_is_pad`` bool tensor — entries marked as - pad are dropped), and ``"robot_type"``, ``"control_mode"`` - (lists of strings — empty string is the pad signal, no - separate ``_is_pad`` flag). Missing keys are treated as - fully padded. + ``"speed"``, ``"quality"``, ``"mistake"``, ``"fps"`` (numeric + tensors with a corresponding ``_is_pad`` bool tensor — + entries marked as pad are dropped), and ``"robot_type"``, + ``"control_mode"`` (lists of strings — empty string is the + pad signal, no separate ``_is_pad`` flag). Missing keys are + treated as fully padded. Returns: A tuple ``(metadata_tokens, metadata_masks)`` with shapes @@ -1111,6 +1111,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: mistake_is_pad, robot_type, control_mode, + fps, + fps_is_pad, ) in zip( batch.get("speed", torch.zeros(batch_size, dtype=torch.float32)), batch.get("quality", torch.zeros(batch_size, dtype=torch.float32)), @@ -1120,6 +1122,8 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: batch.get("mistake_is_pad", torch.ones(batch_size, dtype=torch.bool)), batch.get("robot_type", [""] * batch_size), batch.get("control_mode", [""] * batch_size), + batch.get("fps", torch.zeros(batch_size, dtype=torch.long)), + batch.get("fps_is_pad", torch.ones(batch_size, dtype=torch.bool)), strict=True, ): segments = [] @@ -1135,6 +1139,9 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: if robot_type: segments.append(f"Robot: {robot_type}, ") + if not fps_is_pad: + segments.append(f"fps: {str(fps.item())}, ") + if control_mode: segments.append(f"Control: {control_mode}, ") diff --git a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py index 7164794b..686e4f38 100644 --- a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py +++ b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py @@ -665,17 +665,24 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: """Tokenizes the metadata for training. Wraps each metadata string with an ```` suffix, then tokenizes and - pads to ``metadata_max_length``. + pads to ``metadata_max_length``. Numeric fields (``speed`` / ``quality`` + / ``mistake`` / ``fps``) are paired with ``_is_pad`` flags — pad-True + entries are omitted from the segment string. ``fps`` is the dataset's + native frame rate (``torch.long``); when absent or padded the + ``fps:`` segment is dropped (no header, no comma). """ metadata = [] - for speed, quality, mistake, speed_is_pad, quality_is_pad, mistake_is_pad in zip( + batch_size = batch["state"].shape[0] + for speed, quality, mistake, speed_is_pad, quality_is_pad, mistake_is_pad, fps, fps_is_pad in zip( batch["speed"], batch["quality"], batch["mistake"], batch["speed_is_pad"], batch["quality_is_pad"], batch["mistake_is_pad"], + batch.get("fps", torch.zeros(batch_size, dtype=torch.long)), + batch.get("fps_is_pad", torch.ones(batch_size, dtype=torch.bool)), strict=True, ): segments = [] @@ -688,6 +695,9 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: if not mistake_is_pad: segments.append(f"Mistake: {str(mistake.item())}, ") + if not fps_is_pad: + segments.append(f"fps: {str(fps.item())}, ") + metadata.append(f"Metadata: {' '.join(segments)}") device = batch["state"].device diff --git a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py index 44fa9600..831585a6 100644 --- a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py +++ b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py @@ -1305,6 +1305,12 @@ def _hydrate_metadata_batch(self, batch: dict) -> None: specified explicitly. - ``robot_type`` / ``control_mode`` are string lists (empty string is the pad signal, no separate ``_is_pad`` flag) and default to ``[""]``. + - ``fps`` (the dataset's native frame rate, ``torch.long``) defaults + to zeros and ``fps_is_pad`` defaults to ``True`` when the keys + are absent. The dataloader emits ``fps`` unconditionally (no + dropout) when ``DatasetMixtureConfig.emit_fps=True``; inference + batches built by ``add_eval_metadata`` set it when + ``EnvMetadataConfig.emit_fps=True``. Per-sample on/off is controlled only by ``*_is_pad`` (or the empty string for ``robot_type`` / ``control_mode``) after hydration; varying @@ -1328,6 +1334,10 @@ def _hydrate_metadata_batch(self, batch: dict) -> None: batch["robot_type"] = [""] * bsz if "control_mode" not in batch: batch["control_mode"] = [""] * bsz + if "fps" not in batch: + batch["fps"] = torch.zeros(bsz, dtype=torch.long, device=dev) + if "fps_is_pad" not in batch: + batch["fps_is_pad"] = torch.ones(bsz, dtype=torch.bool, device=dev) def _hydrate_optional_conditioning_batch(self, batch: dict) -> None: """Ensure response / metadata keys exist (training + inference parity). @@ -1375,11 +1385,16 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: """Tokenize episode metadata into PaliGemma token IDs. Builds strings ``Metadata: Speed: … Quality: … Mistake: … Robot: … - Control: …`` only from fields whose ``*_is_pad`` flag is ``False`` - (``robot_type`` / ``control_mode`` are strings, omitted when empty). - For each sample, a padded or empty field is omitted entirely (not - concatenated to ``segments``). When every field is pad, the row is - ``""`` before tokenization. + fps: … Control: …`` only from fields whose ``*_is_pad`` flag is + ``False`` (``robot_type`` / ``control_mode`` are strings, omitted + when empty). For each sample, a padded or empty field is omitted + entirely (not concatenated to ``segments``). When every field is + pad, the row is ``""`` before tokenization. + + ``fps`` is the dataset's native frame rate (``torch.long``); the + segment is omitted when ``fps_is_pad`` is True. Segment ordering is + ``Speed → Quality → Mistake → Robot → fps → Control`` to match the + other pi07 / pi07_paligemma ``prepare_metadata`` implementations. Always runs :meth:`_hydrate_metadata_batch` first so callers see the same outcomes whether they hit :meth:`sample_actions` (often no metadata @@ -1390,14 +1405,15 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: scalar tensors broadcast like ``subgoal_is_pad``. .. note:: - ``speed_is_pad`` / ``quality_is_pad`` / ``mistake_is_pad`` each - default to ``torch.ones(B, dtype=bool)`` (treat-as-pad) when the - key is missing from ``batch``. This is a behavior change from - the previous treat-as-real (``torch.zeros``) default. Hand-built - inference batches that supply ``"speed"`` / ``"quality"`` / - ``"mistake"`` but omit the corresponding ``_is_pad`` flag will - now have those metadata fields silently dropped from the prefix - string — pass ``..._is_pad=torch.zeros(...)`` explicitly when the + ``speed_is_pad`` / ``quality_is_pad`` / ``mistake_is_pad`` / + ``fps_is_pad`` each default to ``torch.ones(B, dtype=bool)`` + (treat-as-pad) when the key is missing from ``batch``. This is + a behavior change from the previous treat-as-real + (``torch.zeros``) default. Hand-built inference batches that + supply ``"speed"`` / ``"quality"`` / ``"mistake"`` / ``"fps"`` + but omit the corresponding ``_is_pad`` flag will now have + those metadata fields silently dropped from the prefix string + — pass ``..._is_pad=torch.zeros(...)`` explicitly when the metadata is real. Args: @@ -1423,6 +1439,18 @@ def _row_float(key: str) -> Tensor: ) return t + def _row_long(key: str) -> Tensor: + """``(B,)`` long-int row (``fps``), scalar-broadcastable.""" + t = batch[key] + t = torch.as_tensor(t, dtype=torch.long, device=device).reshape(-1) + if t.numel() == 1: + t = t.expand(b) + elif t.shape[0] != b: + raise ValueError( + f"{key} must have shape ({b},) or be scalar broadcastable; got {tuple(t.shape)}." + ) + return t + def _row_bool(key: str) -> Tensor: """``(B,)`` boolean row (``mistake`` or ``*_is_pad``), scalar-broadcastable.""" t = batch[key] @@ -1438,9 +1466,11 @@ def _row_bool(key: str) -> Tensor: speed_t = _row_float("speed") quality_t = _row_float("quality") mistake_t = _row_bool("mistake") + fps_t = _row_long("fps") pad_speed = _row_bool("speed_is_pad") pad_quality = _row_bool("quality_is_pad") pad_mistake = _row_bool("mistake_is_pad") + pad_fps = _row_bool("fps_is_pad") robot_types = batch["robot_type"] control_modes = batch["control_mode"] @@ -1454,6 +1484,8 @@ def _row_bool(key: str) -> Tensor: mistake_is_pad, robot_type, control_mode, + fps, + fps_is_pad, ) in zip( speed_t, quality_t, @@ -1463,6 +1495,8 @@ def _row_bool(key: str) -> Tensor: pad_mistake, robot_types, control_modes, + fps_t, + pad_fps, strict=True, ): segments: list[str] = [] @@ -1477,6 +1511,8 @@ def _row_bool(key: str) -> Tensor: segments.append(f"Mistake: {str(mistake.item())}, ") if robot_type: segments.append(f"Robot: {robot_type}, ") + if not fps_is_pad.item(): + segments.append(f"fps: {str(fps.item())}, ") if control_mode: segments.append(f"Control: {control_mode}, ") metadata_rows.append(f"Metadata: {' '.join(segments)}" if segments else "") diff --git a/tests/configs/test_default.py b/tests/configs/test_default.py index 02e20c54..ef544129 100644 --- a/tests/configs/test_default.py +++ b/tests/configs/test_default.py @@ -74,10 +74,43 @@ def test_invalid_action_freq_raises_error(invalid_freq): """ Tests that a ValueError is raised if action_freq is zero or negative. """ - with pytest.raises(ValueError, match=f"`action_freq` must be a positive number, got {invalid_freq}."): + with pytest.raises( + ValueError, + match=rf"`action_freq` must be a positive number or None, got {invalid_freq}\.", + ): DatasetMixtureConfig(action_freq=invalid_freq) +def test_action_freq_defaults_to_none(): + """`action_freq=None` is the new default — each dataset trains at its native fps.""" + cfg = DatasetMixtureConfig() + assert cfg.action_freq is None + + +def test_action_freq_none_is_valid(): + """Explicit `None` is accepted and disables resampling.""" + cfg = DatasetMixtureConfig(action_freq=None) + assert cfg.action_freq is None + + +def test_action_freq_positive_still_valid(): + """A positive float still pins every dataset to that common rate.""" + cfg = DatasetMixtureConfig(action_freq=30.0) + assert cfg.action_freq == 30.0 + + +def test_emit_fps_defaults_to_true(): + """The new `emit_fps` toggle defaults to True so the policy gets fps conditioning out of the box.""" + cfg = DatasetMixtureConfig() + assert cfg.emit_fps is True + + +def test_emit_fps_can_be_disabled(): + """Setting `emit_fps=False` should be accepted (legacy-checkpoint resume escape hatch).""" + cfg = DatasetMixtureConfig(emit_fps=False) + assert cfg.emit_fps is False + + def test_invalid_image_resample_strategy_raises_error(): """ Tests that a ValueError is raised for an unsupported image_resample_strategy. diff --git a/tests/datasets/test_delta_timestamps.py b/tests/datasets/test_delta_timestamps.py index 142ec790..58af8874 100644 --- a/tests/datasets/test_delta_timestamps.py +++ b/tests/datasets/test_delta_timestamps.py @@ -318,6 +318,82 @@ def test_resolve_delta_timestamps_only_state_feature(train_pipeline_config, data assert dt_mean["state"] == [0.0] +def test_resolve_delta_timestamps_native_fps_substitution( + train_pipeline_config, dataset_config, lerobot_dataset_metadata +): + """`action_freq=None` swaps in the dataset's native fps. + + With chunk-style action indices and a mismatched native fps, the + resulting per-action timestamps must land on native-fps frame + boundaries (``i / ds_meta.fps``) — so nearest-neighbor resampling is + a no-op and consecutive frames are returned unchanged. + """ + # `action_delta_indices` is a property derived from `chunk_size` + # (see PI0Config.action_delta_indices = list(range(chunk_size))), so drive + # the test via `chunk_size` rather than monkey-patching the property. + train_pipeline_config.policy.chunk_size = 4 + train_pipeline_config.dataset_mixture.action_freq = None + + # Inject the action feature into the metadata so it gets picked up. + lerobot_dataset_metadata.features.update({"actions": {"dtype": "float32", "shape": [32]}}) + # Pin native fps to something distinct from the legacy default (30) so the + # substitution is observable in the output. + lerobot_dataset_metadata.fps = 50 + + dt_mean, _, _, _ = resolve_delta_timestamps( + train_pipeline_config, dataset_config, lerobot_dataset_metadata + ) + + # Each delta-timestamp value equals `i / native_fps`. + expected = [i / 50 for i in range(4)] + assert np.allclose(dt_mean["actions"], expected) + + +def test_resolve_delta_timestamps_native_fps_differs_per_dataset( + train_pipeline_config, dataset_config, lerobot_dataset_metadata +): + """Two datasets in the same mixture with different native fps must + produce different timestamps when action_freq is None. + + This is the core invariant of mixed-frequency training: a 30 Hz source + and a 50 Hz source see chunk timestamps scaled to their respective + rates, so nearest-neighbor sampling lands on consecutive native frames + in both cases. + """ + train_pipeline_config.policy.chunk_size = 3 + train_pipeline_config.dataset_mixture.action_freq = None + lerobot_dataset_metadata.features.update({"actions": {"dtype": "float32", "shape": [32]}}) + + lerobot_dataset_metadata.fps = 30 + dt_a, _, _, _ = resolve_delta_timestamps(train_pipeline_config, dataset_config, lerobot_dataset_metadata) + + lerobot_dataset_metadata.fps = 50 + dt_b, _, _, _ = resolve_delta_timestamps(train_pipeline_config, dataset_config, lerobot_dataset_metadata) + + assert np.allclose(dt_a["actions"], [0 / 30, 1 / 30, 2 / 30]) + assert np.allclose(dt_b["actions"], [0 / 50, 1 / 50, 2 / 50]) + # Sanity check: the two are genuinely different (not just numerically + # close by accident). + assert not np.allclose(dt_a["actions"], dt_b["actions"]) + + +def test_resolve_delta_timestamps_action_freq_overrides_native_fps( + train_pipeline_config, dataset_config, lerobot_dataset_metadata +): + """When `action_freq` is set (non-None), the native fps is ignored.""" + train_pipeline_config.policy.chunk_size = 3 + train_pipeline_config.dataset_mixture.action_freq = 30.0 + lerobot_dataset_metadata.features.update({"actions": {"dtype": "float32", "shape": [32]}}) + lerobot_dataset_metadata.fps = 50 # different native fps — must be overridden + + dt_mean, _, _, _ = resolve_delta_timestamps( + train_pipeline_config, dataset_config, lerobot_dataset_metadata + ) + + # Timestamps reflect the configured `action_freq=30`, not native fps=50. + assert np.allclose(dt_mean["actions"], [0 / 30, 1 / 30, 2 / 30]) + + def test_resolve_delta_timestamps_other_features_ignored( train_pipeline_config, dataset_config, lerobot_dataset_metadata ): diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py index c199ccc0..54d67071 100644 --- a/tests/datasets/test_optional_keys.py +++ b/tests/datasets/test_optional_keys.py @@ -64,7 +64,9 @@ def __init__( response_drop_prob=0.0, metadata_drop_all_prob=0.0, metadata_drop_each_prob=0.0, + emit_fps=True, meta_info: dict | None = None, + meta_fps: int = 30, ): torch.utils.data.Dataset.__init__(self) self.resolution = resolution @@ -79,10 +81,16 @@ def __init__( self.response_drop_prob = response_drop_prob self.metadata_drop_all_prob = metadata_drop_all_prob self.metadata_drop_each_prob = metadata_drop_each_prob + self.emit_fps = emit_fps self.enable_optional_key_dropout = True # Stub meta surface so _to_standard_data_format can read robot_type / # control_mode without instantiating a real LeRobotDatasetMetadata. - self.meta = SimpleNamespace(info=meta_info if meta_info is not None else {}) + # `fps` mirrors `LeRobotDatasetMetadata.fps` so `_emit_optional_keys` + # can populate the new `fps` / `fps_is_pad` sample keys. + self.meta = SimpleNamespace( + info=meta_info if meta_info is not None else {}, + fps=meta_fps, + ) def _get_feature_mapping_key(self) -> str: return _TEST_MAPPING_KEY @@ -682,3 +690,77 @@ def test_emitted_as_python_strings_for_default_collate(self): out = ds._to_standard_data_format(_full_raw_item(ds)) assert type(out["robot_type"]) is str assert type(out["control_mode"]) is str + + +# `fps` is emitted as an intrinsic dataset property — always present (non-pad) +# when `emit_fps=True`, omitted entirely when `emit_fps=False`. Unlike speed / +# quality / mistake / robot_type / control_mode it does NOT participate in any +# of the `metadata_drop_*_prob` dropout rolls. + + +class TestFpsEmission: + def test_fps_emitted_with_meta_fps_value(self): + ds = _DummyBaseDataset(num_cams=1, meta_fps=20) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert out["fps"].dtype == torch.long + assert out["fps"].item() == 20 + assert out["fps_is_pad"].item() is False + + def test_fps_emitted_as_torch_long_dtype(self): + ds = _DummyBaseDataset(num_cams=1, meta_fps=50) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert out["fps"].dtype == torch.long + + def test_fps_omitted_when_emit_fps_false(self): + ds = _DummyBaseDataset(num_cams=1, emit_fps=False, meta_fps=20) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert "fps" not in out + assert "fps_is_pad" not in out + + def test_fps_unaffected_by_metadata_drop_all_prob(self): + # fps is intrinsic to the dataset, not a noisy label — it must NOT + # participate in `metadata_drop_*_prob` rolls. + ds = _DummyBaseDataset( + num_cams=1, + meta_fps=15, + metadata_drop_all_prob=1.0, + metadata_drop_each_prob=1.0, + ) + out = ds._to_standard_data_format(_full_raw_item(ds)) + # Other metadata fields are dropped — but fps stays. + assert out["speed_is_pad"].item() is True + assert out["quality_is_pad"].item() is True + assert out["mistake_is_pad"].item() is True + assert out["robot_type"] == "" + assert out["control_mode"] == "" + assert out["fps"].item() == 15 + assert out["fps_is_pad"].item() is False + + def test_fps_unaffected_by_disabled_dropout(self): + # `enable_optional_key_dropout=False` (the val-subset path) must keep + # the same emit behavior — fps is gated by `emit_fps`, not by dropout. + ds = _DummyBaseDataset( + num_cams=1, + meta_fps=25, + metadata_drop_all_prob=1.0, + metadata_drop_each_prob=1.0, + ) + ds.enable_optional_key_dropout = False + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert out["fps"].item() == 25 + assert out["fps_is_pad"].item() is False + + def test_default_collate_stacks_mixed_fps(self): + """Heterogeneous-fps batches (different datasets in a mixture) must + collate cleanly: the per-sample `fps` tensors become a (B,) long + tensor and `fps_is_pad` becomes a (B,) bool tensor.""" + from torch.utils.data import default_collate + + ds_a = _DummyBaseDataset(num_cams=1, meta_fps=30) + ds_b = _DummyBaseDataset(num_cams=1, meta_fps=50) + item_a = ds_a._to_standard_data_format(_full_raw_item(ds_a)) + item_b = ds_b._to_standard_data_format(_full_raw_item(ds_b)) + batch = default_collate([item_a, item_b]) + assert batch["fps"].dtype == torch.long + assert batch["fps"].tolist() == [30, 50] + assert batch["fps_is_pad"].tolist() == [False, False] diff --git a/tests/envs/test_configs.py b/tests/envs/test_configs.py index 3479c4ae..f475abd9 100644 --- a/tests/envs/test_configs.py +++ b/tests/envs/test_configs.py @@ -188,6 +188,23 @@ def test_all_none_default_passes(self): assert cfg.mistake is None assert cfg.robot_type is None assert cfg.control_mode is None + # `emit_fps` is a boolean toggle (not a None-presence field) and + # defaults to True — the policy receives an fps segment at eval + # unless the user explicitly opts out. + assert cfg.emit_fps is True + + def test_emit_fps_can_be_disabled(self): + cfg = EnvMetadataConfig(emit_fps=False) + assert cfg.emit_fps is False + + def test_emit_fps_kept_orthogonal_to_other_fields(self): + """Setting ``emit_fps`` must not perturb the other fields' defaults.""" + cfg = EnvMetadataConfig(emit_fps=False) + assert cfg.speed is None + assert cfg.quality is None + assert cfg.mistake is None + assert cfg.robot_type is None + assert cfg.control_mode is None @pytest.mark.parametrize("mode", ["joint", "ee"]) @pytest.mark.parametrize("speed", [0, 10, 50, 90, 100]) diff --git a/tests/envs/test_utils.py b/tests/envs/test_utils.py index fdbd6e9e..ebb865a8 100644 --- a/tests/envs/test_utils.py +++ b/tests/envs/test_utils.py @@ -208,13 +208,21 @@ def _make_obs(batch_size: int = 2, device: str = "cpu") -> dict: return {"state": torch.zeros(batch_size, 8, device=device, dtype=torch.float32)} -def _make_cfg(**metadata_kwargs) -> SimpleNamespace: - """Build a duck-typed ``cfg`` exposing only ``cfg.env.metadata``. +def _make_cfg(*, fps: int = 30, **metadata_kwargs) -> SimpleNamespace: + """Build a duck-typed ``cfg`` exposing ``cfg.env.metadata`` and + ``cfg.env.fps``. Avoids constructing a full ``TrainPipelineConfig`` (which pulls in - optimizer/dataset/policy validation) just to read five attributes. + optimizer/dataset/policy validation) just to read a handful of attributes. + ``fps`` defaults to 30 (matches ``EnvConfig.fps``) so the + ``emit_fps=True`` default path produces a deterministic value. """ - return SimpleNamespace(env=SimpleNamespace(metadata=EnvMetadataConfig(**metadata_kwargs))) + return SimpleNamespace( + env=SimpleNamespace( + metadata=EnvMetadataConfig(**metadata_kwargs), + fps=fps, + ) + ) class TestAddEvalMetadata: @@ -224,13 +232,31 @@ class TestAddEvalMetadata: silently changes the prefix the model sees at eval. """ - def test_all_none_default_skips_every_key(self): + def test_all_none_default_skips_user_keys_but_still_emits_fps(self): + """``EnvMetadataConfig`` defaults: every user-controlled field is + ``None`` *but* ``emit_fps`` defaults to ``True``, so the helper still + injects ``fps`` / ``fps_is_pad`` from ``cfg.env.fps``. + """ obs = _make_obs() original_keys = set(obs.keys()) out = add_eval_metadata(obs, cfg=_make_cfg()) assert out is obs # mutated-in-place contract + new_keys = set(out.keys()) - original_keys + assert new_keys == {"fps", "fps_is_pad"}, ( + f"unexpected injected keys: {new_keys}; only fps/fps_is_pad should appear " + f"under the all-None metadata default" + ) + + def test_emit_fps_false_skips_fps(self): + """``emit_fps=False`` on ``EnvMetadataConfig`` reverts the helper to the + legacy "skip all when nothing set" behaviour — useful when resuming a + checkpoint trained without fps conditioning. + """ + obs = _make_obs() + original_keys = set(obs.keys()) + out = add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False)) assert set(out.keys()) == original_keys, ( - "no metadata keys should be injected when every field is None" + "no keys should be injected when emit_fps=False and every metadata field is None" ) @pytest.mark.parametrize("batch_size", [1, 4]) @@ -291,12 +317,22 @@ def test_string_fields_broadcast_as_list(self, key, value): assert f"{key}_is_pad" not in obs, "string fields use empty-string as pad signal, not a flag" def test_partial_fields_only_inject_specified_keys(self): + # `emit_fps=False` so this test isolates the user-set fields from the + # default fps injection (covered separately in test_emit_fps_*). obs = _make_obs() - add_eval_metadata(obs, cfg=_make_cfg(speed=30, robot_type="UR5")) + add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5")) assert "speed" in obs and "speed_is_pad" in obs assert "robot_type" in obs - for absent in ("quality", "quality_is_pad", "mistake", "mistake_is_pad", "control_mode"): + for absent in ( + "quality", + "quality_is_pad", + "mistake", + "mistake_is_pad", + "control_mode", + "fps", + "fps_is_pad", + ): assert absent not in obs, f"unset field {absent} should not be injected" def test_device_propagation(self): @@ -313,3 +349,42 @@ def test_device_propagation(self): assert obs["speed_is_pad"].device.type == "meta" assert obs["mistake"].device.type == "meta" assert obs["mistake_is_pad"].device.type == "meta" + # fps gets injected too under the default `emit_fps=True`; it must + # share the same device as state. + assert obs["fps"].device.type == "meta" + assert obs["fps_is_pad"].device.type == "meta" + + @pytest.mark.parametrize("batch_size", [1, 4]) + def test_fps_injects_long_tensor_from_env_fps(self, batch_size): + """``emit_fps=True`` (default) → broadcast ``cfg.env.fps`` as a + ``(B,)`` torch.long tensor with ``fps_is_pad=False`` rows.""" + obs = _make_obs(batch_size=batch_size) + add_eval_metadata(obs, cfg=_make_cfg(fps=20)) # LiberoEnv-style fps + + assert obs["fps"].dtype == torch.long + assert obs["fps"].shape == (batch_size,) + assert torch.equal(obs["fps"], torch.full((batch_size,), 20, dtype=torch.long)) + + assert obs["fps_is_pad"].dtype == torch.bool + assert obs["fps_is_pad"].shape == (batch_size,) + assert not obs["fps_is_pad"].any() + + def test_fps_value_comes_from_env_not_metadata(self): + """The fps value source is ``cfg.env.fps`` (env stepping freq), NOT a + field on ``EnvMetadataConfig`` (which only carries the on/off toggle).""" + obs = _make_obs(batch_size=2) + add_eval_metadata(obs, cfg=_make_cfg(fps=50)) + + assert torch.equal(obs["fps"], torch.full((2,), 50, dtype=torch.long)) + + def test_fps_disabled_skips_only_fps_keys(self): + """``emit_fps=False`` skips the fps injection but leaves other set + fields alone — they go through the regular None-presence gating.""" + obs = _make_obs(batch_size=2) + add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5")) + + assert "fps" not in obs + assert "fps_is_pad" not in obs + # Existing user-set fields still flow. + assert "speed" in obs + assert obs["robot_type"] == ["UR5", "UR5"] diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py index 88e254d7..59fde78d 100644 --- a/tests/policies/test_pi07_cpu.py +++ b/tests/policies/test_pi07_cpu.py @@ -542,6 +542,148 @@ def test_low_level_missing_speed_pad_does_not_fabricate_value(self): assert "Mistake:" not in line +class TestPrepareMetadataFps: + """Verify the ``fps`` segment construction in the pi07 planners. + + ``fps`` is the dataset's native frame rate. The planner tokenizes it + with the lowercase ``"fps: N, "`` header, positioned between + ``Robot:`` and ``Control:`` for consistency with the + ``Speed → Quality → Mistake → Robot → fps → Control`` ordering pinned + in the segment-assembly loop of all four pi07 / pi07_paligemma + modelling files. The segment is omitted entirely (no header, no + comma) when ``fps_is_pad`` is ``True`` or when the ``fps`` key is + absent from the batch. + """ + + @staticmethod + def _planner_methods(): + from opentau.policies.pi07.high_level_planner.modeling_pi07_high_level import ( + PI07HighLevelPlannerPolicy, + ) + from opentau.policies.pi07.low_level.modeling_pi07_low_level import ( + PI07LowLevelPolicy, + ) + + return { + "low": PI07LowLevelPolicy.prepare_metadata, + "high": PI07HighLevelPlannerPolicy.prepare_metadata, + } + + @pytest.mark.parametrize("planner", ["low", "high"]) + def test_fps_present_emits_lowercase_segment(self, planner): + """``fps=30`` non-pad → ``"fps: 30, "`` appears in the prefix.""" + method = self._planner_methods()[planner] + fake, captured = _make_fake_planner() + + batch_size = 2 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([30, 50], dtype=torch.long), + "fps_is_pad": torch.tensor([False, False]), + } + + method(fake, batch) + + assert len(captured) == batch_size + for line, fps in zip(captured, [30, 50], strict=True): + assert line.startswith("Metadata: ") + assert f"fps: {fps}, " in line + # Lowercase per spec — must NOT be "Fps:" or "FPS:". + assert "Fps:" not in line + assert "FPS:" not in line + + @pytest.mark.parametrize("planner", ["low", "high"]) + def test_fps_padded_omits_segment(self, planner): + """``fps_is_pad=True`` → no ``"fps:"`` substring, no stray comma.""" + method = self._planner_methods()[planner] + fake, captured = _make_fake_planner() + + batch_size = 2 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([30, 30], dtype=torch.long), + "fps_is_pad": torch.tensor([True, True]), + "robot_type": ["franka", "ur5"], + } + + method(fake, batch) + + for line in captured: + assert "fps:" not in line + # No stray double commas where the fps segment would have lived. + assert ", ," not in line + # Robot segment still present (unaffected by fps padding). + assert "Robot: " in line + + @pytest.mark.parametrize("planner", ["low", "high"]) + def test_fps_key_absent_omits_segment(self, planner): + """Missing ``fps`` key entirely → batch.get default-pad path drops it.""" + method = self._planner_methods()[planner] + fake, captured = _make_fake_planner() + + batch_size = 3 + batch = { + "state": torch.zeros(batch_size, 1), + "robot_type": ["franka", "franka", "franka"], + } + + method(fake, batch) + + for line in captured: + assert "fps:" not in line + + @pytest.mark.parametrize("planner", ["low", "high"]) + def test_fps_slots_between_robot_and_control(self, planner): + """The new fps segment sits between Robot: and Control: in the prefix string.""" + method = self._planner_methods()[planner] + fake, captured = _make_fake_planner() + + batch_size = 1 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([20], dtype=torch.long), + "fps_is_pad": torch.tensor([False]), + "robot_type": ["franka"], + "control_mode": ["joint"], + } + + method(fake, batch) + + line = captured[0] + robot_idx = line.index("Robot: ") + fps_idx = line.index("fps: 20") + control_idx = line.index("Control: ") + assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}" + + @pytest.mark.parametrize("planner", ["low", "high"]) + def test_fps_with_partial_segments(self, planner): + """fps + a subset of other metadata still produces a well-formed prefix.""" + method = self._planner_methods()[planner] + fake, captured = _make_fake_planner() + + batch_size = 2 + # Sample 0: fps + robot_type only. Sample 1: fps + control_mode only. + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([30, 50], dtype=torch.long), + "fps_is_pad": torch.tensor([False, False]), + "robot_type": ["franka", ""], + "control_mode": ["", "joint"], + } + + method(fake, batch) + + assert captured[0].startswith("Metadata: ") + assert "Robot: franka, " in captured[0] + assert "fps: 30, " in captured[0] + assert "Control:" not in captured[0] + + assert captured[1].startswith("Metadata: ") + assert "Robot:" not in captured[1] + assert "fps: 50, " in captured[1] + assert "Control: joint, " in captured[1] + + # `embed_prefix` conditional-block guards # # The low-level component skips entire prefix blocks when their availability diff --git a/tests/policies/test_pi07_paligemma_high_level_planner.py b/tests/policies/test_pi07_paligemma_high_level_planner.py index a707e653..3014278b 100644 --- a/tests/policies/test_pi07_paligemma_high_level_planner.py +++ b/tests/policies/test_pi07_paligemma_high_level_planner.py @@ -322,3 +322,103 @@ def capture_embed_prefix_infer(*args, **kwargs): # Output shapes. assert memory_tokens.shape == (1, MEMORY_MAX_LENGTH) assert response_tokens.shape == (1, RESPONSE_MAX_LENGTH) + + +# CPU-only tests for the metadata-string assembly loop in +# ``PI07HighLevelPlannerPolicy.prepare_metadata``. Pin the contract that ``fps`` +# is tokenized with the lowercase ``"fps: N, "`` header positioned between +# ``Speed/Quality/Mistake`` (this planner has no ``Robot:``/``Control:`` +# segments) and omitted entirely when ``fps_is_pad`` is True. + + +def _pg_hl_make_fake_planner(metadata_max_length: int = 4): + """Construct a minimal ``self`` stand-in for the high-level pi07_paligemma + planner's ``prepare_metadata`` method. + + The method only touches ``self.language_tokenizer`` and + ``self.config.metadata_max_length`` (no ``_hydrate_metadata_batch``), so a + plain SimpleNamespace with a stub tokenizer is enough — no PaliGemma + backbone, no HuggingFace download. + """ + import types + + captured: list[str] = [] + + def stub_call(metadata, **kwargs): + captured.extend(metadata) + batch_size = len(metadata) + max_length = kwargs.get("max_length", 4) + return { + "input_ids": torch.zeros((batch_size, max_length), dtype=torch.long), + "attention_mask": torch.zeros((batch_size, max_length), dtype=torch.long), + } + + tokenizer = types.SimpleNamespace() + tokenizer.__call__ = stub_call + fake = types.SimpleNamespace( + language_tokenizer=tokenizer, + config=types.SimpleNamespace(metadata_max_length=metadata_max_length), + ) + return fake, captured + + +def _pg_hl_base_batch(batch_size: int) -> dict: + """Build a batch with the mandatory speed/quality/mistake keys. + + Unlike pi07's ``prepare_metadata``, this planner reads ``batch["speed"]`` + etc. directly (no ``batch.get`` fallback), so every required numeric + + ``_is_pad`` key must be present. + """ + return { + "state": torch.zeros(batch_size, 1), + "speed": torch.tensor([50.0] * batch_size, dtype=torch.float32), + "quality": torch.tensor([3.0] * batch_size, dtype=torch.float32), + "mistake": torch.tensor([False] * batch_size, dtype=torch.bool), + "speed_is_pad": torch.tensor([True] * batch_size), + "quality_is_pad": torch.tensor([True] * batch_size), + "mistake_is_pad": torch.tensor([True] * batch_size), + } + + +class TestPaligemmaHighLevelFpsSegment: + def test_fps_present_emits_lowercase_segment(self): + method = PI07HighLevelPlannerPolicy.prepare_metadata + fake, captured = _pg_hl_make_fake_planner() + + batch = _pg_hl_base_batch(batch_size=2) + batch["fps"] = torch.tensor([30, 50], dtype=torch.long) + batch["fps_is_pad"] = torch.tensor([False, False]) + + method(fake, batch) + + assert len(captured) == 2 + for line, fps in zip(captured, [30, 50], strict=True): + assert line.startswith("Metadata: ") + assert f"fps: {fps}, " in line + assert "Fps:" not in line and "FPS:" not in line + + def test_fps_padded_omits_segment(self): + method = PI07HighLevelPlannerPolicy.prepare_metadata + fake, captured = _pg_hl_make_fake_planner() + + batch = _pg_hl_base_batch(batch_size=1) + batch["fps"] = torch.tensor([30], dtype=torch.long) + batch["fps_is_pad"] = torch.tensor([True]) + + method(fake, batch) + + assert "fps:" not in captured[0] + # No stray comma where the fps segment would have lived. + assert ", ," not in captured[0] + + def test_fps_key_absent_omits_segment(self): + method = PI07HighLevelPlannerPolicy.prepare_metadata + fake, captured = _pg_hl_make_fake_planner() + + # Note: no "fps" / "fps_is_pad" keys at all — the `batch.get` defaults + # should kick in and produce no segment. + batch = _pg_hl_base_batch(batch_size=3) + method(fake, batch) + + for line in captured: + assert "fps:" not in line diff --git a/tests/policies/test_pi07_paligemma_low_level.py b/tests/policies/test_pi07_paligemma_low_level.py index e09dca73..366c3593 100644 --- a/tests/policies/test_pi07_paligemma_low_level.py +++ b/tests/policies/test_pi07_paligemma_low_level.py @@ -2253,3 +2253,130 @@ def fake_sample_actions(batch, noise=None, action_prefix=None, delay=None): a_next = PI07PaligemmaLowLevelPolicy.select_action(policy, batch) assert calls["n"] == 2 assert a_next[0, 0].item() == 2000.0 + + +# CPU-only tests for the metadata-string assembly loop in +# ``PI07PaligemmaLowLevelPolicy.prepare_metadata``. Pin the contract that +# ``fps`` is tokenized with the lowercase ``"fps: N, "`` header positioned +# between ``Robot:`` and ``Control:`` and omitted entirely when ``fps_is_pad`` +# is True (or when the keys are missing — ``_hydrate_metadata_batch`` defaults +# them to all-padded). + + +def _pg_ll_make_fake_policy(metadata_max_length: int = 4): + """Construct a stubbed PI07PaligemmaLowLevelPolicy for prepare_metadata tests. + + ``prepare_metadata`` calls ``self._hydrate_metadata_batch(batch)`` first, + so a bare ``SimpleNamespace`` doesn't work — we use + ``object.__new__(PI07PaligemmaLowLevelPolicy)`` to get a real instance + with the class methods bound, then stub only the tokenizer + config to + skip the PaliGemma backbone download. + """ + import types + + captured: list[str] = [] + + def stub_call(metadata, **kwargs): + captured.extend(metadata) + batch_size = len(metadata) + max_length = kwargs.get("max_length", 4) + return { + "input_ids": torch.zeros((batch_size, max_length), dtype=torch.long), + "attention_mask": torch.zeros((batch_size, max_length), dtype=torch.long), + } + + tokenizer = types.SimpleNamespace() + tokenizer.__call__ = stub_call + + policy = object.__new__(PI07PaligemmaLowLevelPolicy) + policy.language_tokenizer = tokenizer + policy.config = types.SimpleNamespace(metadata_max_length=metadata_max_length) + return policy, captured + + +class TestPaligemmaLowLevelFpsSegment: + def test_fps_present_emits_lowercase_segment(self): + policy, captured = _pg_ll_make_fake_policy() + batch_size = 2 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([30, 50], dtype=torch.long), + "fps_is_pad": torch.tensor([False, False]), + } + # _hydrate_metadata_batch will fill in speed/quality/mistake/* defaults + # (all padded) and empty robot_type/control_mode lists. + + PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) + + assert len(captured) == batch_size + for line, fps in zip(captured, [30, 50], strict=True): + assert line.startswith("Metadata: ") + assert f"fps: {fps}, " in line + assert "Fps:" not in line and "FPS:" not in line + + def test_fps_padded_omits_segment(self): + policy, captured = _pg_ll_make_fake_policy() + batch_size = 1 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([30], dtype=torch.long), + "fps_is_pad": torch.tensor([True]), + "robot_type": ["franka"], + } + + PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) + + assert "fps:" not in captured[0] + assert ", ," not in captured[0] + assert "Robot: franka, " in captured[0] + + def test_fps_key_absent_omits_segment(self): + """``_hydrate_metadata_batch`` defaults ``fps_is_pad`` to True when + absent, so missing keys collapse to the same "no segment" behaviour + as explicit-pad.""" + policy, captured = _pg_ll_make_fake_policy() + batch_size = 3 + batch = {"state": torch.zeros(batch_size, 1)} + + PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) + + for line in captured: + assert "fps:" not in line + + def test_fps_slots_between_robot_and_control(self): + policy, captured = _pg_ll_make_fake_policy() + batch_size = 1 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([20], dtype=torch.long), + "fps_is_pad": torch.tensor([False]), + "robot_type": ["franka"], + "control_mode": ["joint"], + } + + PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) + + line = captured[0] + robot_idx = line.index("Robot: ") + fps_idx = line.index("fps: 20") + control_idx = line.index("Control: ") + assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}" + + def test_fps_default_torch_long(self): + """fps is hydrated with ``torch.long`` defaults; assert ``_row_long`` + preserves the dtype through the broadcast path.""" + policy, captured = _pg_ll_make_fake_policy() + # Scalar-broadcast fps (one element, fan out to (B,)). + batch_size = 3 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([25], dtype=torch.long), + "fps_is_pad": torch.tensor([False]), + } + + PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) + + # All three samples get fps: 25 from the scalar-broadcast. + assert len(captured) == 3 + for line in captured: + assert "fps: 25, " in line From 7236b5aec1a01a45d3abde7316009476015ef488 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 19:37:41 -0700 Subject: [PATCH 2/7] fix(datasets): emit effective fps, not native, when action_freq is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_emit_optional_keys` previously emitted `meta.fps` regardless of the mixture's `action_freq`. When the user pinned `action_freq=30.0` on a mixture containing a 50 Hz dataset, `resolve_delta_timestamps` resampled the chunk to 30 Hz but the prefix still said `fps: 50, ` — telling the policy a different rate than the chunk actually carried. Capture `_action_freq` on `BaseDataset` and emit `action_freq if set else meta.fps` so the tokenized fps always matches the chunk's effective rate. --- src/opentau/datasets/lerobot_dataset.py | 39 ++++++++++++++++--------- tests/datasets/test_optional_keys.py | 27 +++++++++++++++++ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py index 9b3303ae..8fc43544 100644 --- a/src/opentau/datasets/lerobot_dataset.py +++ b/src/opentau/datasets/lerobot_dataset.py @@ -698,10 +698,12 @@ def __init__(self, cfg: TrainPipelineConfig): self.response_drop_prob = dm.response_drop_prob if dm else 0.0 self.metadata_drop_all_prob = dm.metadata_drop_all_prob if dm else 0.0 self.metadata_drop_each_prob = dm.metadata_drop_each_prob if dm else 0.0 - # Whether `_emit_optional_keys` injects the dataset's native `meta.fps` - # under the `fps` / `fps_is_pad` keys. Independent of the dropout rolls - # above — fps is intrinsic to the dataset, not a noisy label. + # `_emit_optional_keys` emits the *effective* per-sample fps when this + # is True: the mixture's `action_freq` if set (every dataset has been + # resampled to that rate via `resolve_delta_timestamps`), else the + # dataset's native `meta.fps`. Independent of the dropout rolls above. self.emit_fps = dm.emit_fps if dm else True + self._action_freq = dm.action_freq if dm else None # Whether the above drop rolls actually fire. `make_dataset` flips this # off on the validation subset (unless `val_enable_optional_key_dropout` # is set). Subgoal *frame* selection (end-of-segment vs. uniform window) @@ -879,14 +881,16 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None: pad signal — a consumer seeing ``""`` can assume the field was unavailable or was masked this step. - ``fps`` (the dataset's native frame rate) is emitted as a - ``torch.long`` scalar alongside ``fps_is_pad`` when - ``self.emit_fps`` is True. Unlike the other metadata fields, ``fps`` - does **not** participate in the dropout rolls — it's an intrinsic - property of the dataset, not a noisy label — so it's always - non-padded when emitted. Set ``emit_fps=False`` on the mixture to - omit the keys entirely (the policy's ``prepare_metadata`` falls - through to its default-pad path). + ``fps`` (the *effective* per-sample frame rate — ``action_freq`` if + set on the mixture, else the dataset's native ``meta.fps``) is + emitted as a ``torch.long`` scalar alongside ``fps_is_pad`` when + ``self.emit_fps`` is True. Unlike the other metadata fields, + ``fps`` does **not** participate in the dropout rolls — it's an + intrinsic property of the (possibly resampled) chunk, not a noisy + label — so it's always non-padded when emitted. Set + ``emit_fps=False`` on the mixture to omit the keys entirely (the + policy's ``prepare_metadata`` falls through to its default-pad + path). Dropout order: 1. ``history_state_drop_prob``: zero ``state`` and historical camera @@ -989,10 +993,17 @@ def _roll(prob: float) -> bool: drop_this = drop_meta_all or _roll(self.metadata_drop_each_prob) standard_item[key] = "" if drop_this else val - # (6) Native fps: intrinsic dataset property, not a noisy label. Emitted - # unconditionally (no dropout) when `emit_fps` is enabled on the mixture. + # (6) Effective fps: the rate the action chunk actually runs at after + # any mixture-level resampling. When `action_freq` is set, every + # dataset's chunks are nearest-neighbor resampled to that rate by + # `resolve_delta_timestamps`, so tokenizing the dataset's native + # `meta.fps` would mislead the policy (the chunk timing differs). + # When `action_freq is None` (no resampling), the chunk runs at the + # dataset's native rate. Always non-pad (no dropout) when emit_fps + # is enabled on the mixture. if self.emit_fps: - standard_item["fps"] = torch.tensor(int(self.meta.fps), dtype=torch.long) + effective_fps = int(self._action_freq) if self._action_freq is not None else int(self.meta.fps) + standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long) standard_item["fps_is_pad"] = torch.tensor(False) def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor: diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py index 54d67071..60ee2f58 100644 --- a/tests/datasets/test_optional_keys.py +++ b/tests/datasets/test_optional_keys.py @@ -65,6 +65,7 @@ def __init__( metadata_drop_all_prob=0.0, metadata_drop_each_prob=0.0, emit_fps=True, + action_freq: float | None = None, meta_info: dict | None = None, meta_fps: int = 30, ): @@ -82,6 +83,11 @@ def __init__( self.metadata_drop_all_prob = metadata_drop_all_prob self.metadata_drop_each_prob = metadata_drop_each_prob self.emit_fps = emit_fps + # Mirrors `BaseDataset._action_freq` (mixture-level `action_freq`). + # When set, `_emit_optional_keys` emits this value as the effective + # fps instead of `meta.fps` — matching the rate that + # `resolve_delta_timestamps` resampled the chunk to. + self._action_freq = action_freq self.enable_optional_key_dropout = True # Stub meta surface so _to_standard_data_format can read robot_type / # control_mode without instantiating a real LeRobotDatasetMetadata. @@ -764,3 +770,24 @@ def test_default_collate_stacks_mixed_fps(self): assert batch["fps"].dtype == torch.long assert batch["fps"].tolist() == [30, 50] assert batch["fps_is_pad"].tolist() == [False, False] + + def test_fps_reports_action_freq_when_resampling(self): + """When the mixture sets `action_freq`, every chunk is nearest-neighbor + resampled to that rate by `resolve_delta_timestamps`. The emitted + `fps` must reflect the *effective* rate of the resampled chunk, not + the dataset's native rate — otherwise the prefix tells the model a + different timing than the chunk actually carries. + """ + # Dataset is natively 50 Hz, but the mixture pins everything to 30 Hz. + ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=30.0) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert out["fps"].item() == 30 + assert out["fps_is_pad"].item() is False + + def test_fps_reports_native_when_action_freq_none(self): + """When `action_freq is None` (no resampling), the chunk runs at the + dataset's native rate — that's what gets emitted.""" + ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=None) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert out["fps"].item() == 50 + assert out["fps_is_pad"].item() is False From fe04ffd17d8e0051af27ae8c74d685950339bb5f Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 19:41:59 -0700 Subject: [PATCH 3/7] style(pi07): capitalize fps header to FPS for sibling consistency --- .../modeling_pi07_high_level.py | 2 +- .../pi07/low_level/modeling_pi07_low_level.py | 2 +- .../modeling_pi07_high_level.py | 2 +- .../low_level/modeling_pi07_low_level.py | 13 +++---- tests/policies/test_pi07_cpu.py | 35 ++++++++++--------- .../test_pi07_paligemma_high_level_planner.py | 13 +++---- .../policies/test_pi07_paligemma_low_level.py | 27 +++++++------- 7 files changed, 49 insertions(+), 45 deletions(-) diff --git a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py index d320bce6..e24fbd74 100644 --- a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py +++ b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py @@ -740,7 +740,7 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: segments.append(f"Robot: {robot_type}, ") if not fps_is_pad: - segments.append(f"fps: {str(fps.item())}, ") + segments.append(f"FPS: {str(fps.item())}, ") if control_mode: segments.append(f"Control: {control_mode}, ") diff --git a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py index 6ca13d3c..f552a5ec 100644 --- a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py +++ b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py @@ -1140,7 +1140,7 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: segments.append(f"Robot: {robot_type}, ") if not fps_is_pad: - segments.append(f"fps: {str(fps.item())}, ") + segments.append(f"FPS: {str(fps.item())}, ") if control_mode: segments.append(f"Control: {control_mode}, ") diff --git a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py index 686e4f38..3e600da1 100644 --- a/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py +++ b/src/opentau/policies/pi07_paligemma/high_level_planner/modeling_pi07_high_level.py @@ -696,7 +696,7 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: segments.append(f"Mistake: {str(mistake.item())}, ") if not fps_is_pad: - segments.append(f"fps: {str(fps.item())}, ") + segments.append(f"FPS: {str(fps.item())}, ") metadata.append(f"Metadata: {' '.join(segments)}") diff --git a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py index 831585a6..c5c308d6 100644 --- a/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py +++ b/src/opentau/policies/pi07_paligemma/low_level/modeling_pi07_low_level.py @@ -1385,16 +1385,17 @@ def prepare_metadata(self, batch: dict[str, Tensor]) -> tuple[Tensor, Tensor]: """Tokenize episode metadata into PaliGemma token IDs. Builds strings ``Metadata: Speed: … Quality: … Mistake: … Robot: … - fps: … Control: …`` only from fields whose ``*_is_pad`` flag is + FPS: … Control: …`` only from fields whose ``*_is_pad`` flag is ``False`` (``robot_type`` / ``control_mode`` are strings, omitted when empty). For each sample, a padded or empty field is omitted entirely (not concatenated to ``segments``). When every field is pad, the row is ``""`` before tokenization. - ``fps`` is the dataset's native frame rate (``torch.long``); the - segment is omitted when ``fps_is_pad`` is True. Segment ordering is - ``Speed → Quality → Mistake → Robot → fps → Control`` to match the - other pi07 / pi07_paligemma ``prepare_metadata`` implementations. + ``fps`` is the effective per-sample frame rate (``torch.long``); + the segment is omitted when ``fps_is_pad`` is True. Segment + ordering is ``Speed → Quality → Mistake → Robot → FPS → Control`` + to match the other pi07 / pi07_paligemma ``prepare_metadata`` + implementations. Always runs :meth:`_hydrate_metadata_batch` first so callers see the same outcomes whether they hit :meth:`sample_actions` (often no metadata @@ -1512,7 +1513,7 @@ def _row_bool(key: str) -> Tensor: if robot_type: segments.append(f"Robot: {robot_type}, ") if not fps_is_pad.item(): - segments.append(f"fps: {str(fps.item())}, ") + segments.append(f"FPS: {str(fps.item())}, ") if control_mode: segments.append(f"Control: {control_mode}, ") metadata_rows.append(f"Metadata: {' '.join(segments)}" if segments else "") diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py index 59fde78d..c710f680 100644 --- a/tests/policies/test_pi07_cpu.py +++ b/tests/policies/test_pi07_cpu.py @@ -545,11 +545,12 @@ def test_low_level_missing_speed_pad_does_not_fabricate_value(self): class TestPrepareMetadataFps: """Verify the ``fps`` segment construction in the pi07 planners. - ``fps`` is the dataset's native frame rate. The planner tokenizes it - with the lowercase ``"fps: N, "`` header, positioned between - ``Robot:`` and ``Control:`` for consistency with the - ``Speed → Quality → Mistake → Robot → fps → Control`` ordering pinned - in the segment-assembly loop of all four pi07 / pi07_paligemma + ``fps`` is the effective per-sample frame rate. The planner tokenizes + it with the ``"FPS: N, "`` header (uppercase to match the sibling + ``Speed:``/``Quality:``/``Mistake:``/``Robot:``/``Control:`` labels), + positioned between ``Robot:`` and ``Control:`` for consistency with + the ``Speed → Quality → Mistake → Robot → FPS → Control`` ordering + pinned in the segment-assembly loop of all four pi07 / pi07_paligemma modelling files. The segment is omitted entirely (no header, no comma) when ``fps_is_pad`` is ``True`` or when the ``fps`` key is absent from the batch. @@ -570,8 +571,8 @@ def _planner_methods(): } @pytest.mark.parametrize("planner", ["low", "high"]) - def test_fps_present_emits_lowercase_segment(self, planner): - """``fps=30`` non-pad → ``"fps: 30, "`` appears in the prefix.""" + def test_fps_present_emits_uppercase_segment(self, planner): + """``fps=30`` non-pad → ``"FPS: 30, "`` appears in the prefix.""" method = self._planner_methods()[planner] fake, captured = _make_fake_planner() @@ -587,14 +588,14 @@ def test_fps_present_emits_lowercase_segment(self, planner): assert len(captured) == batch_size for line, fps in zip(captured, [30, 50], strict=True): assert line.startswith("Metadata: ") - assert f"fps: {fps}, " in line - # Lowercase per spec — must NOT be "Fps:" or "FPS:". + assert f"FPS: {fps}, " in line + # Uppercase per spec — must NOT be "fps:" or "Fps:". + assert "fps:" not in line assert "Fps:" not in line - assert "FPS:" not in line @pytest.mark.parametrize("planner", ["low", "high"]) def test_fps_padded_omits_segment(self, planner): - """``fps_is_pad=True`` → no ``"fps:"`` substring, no stray comma.""" + """``fps_is_pad=True`` → no ``"FPS:"`` substring, no stray comma.""" method = self._planner_methods()[planner] fake, captured = _make_fake_planner() @@ -609,7 +610,7 @@ def test_fps_padded_omits_segment(self, planner): method(fake, batch) for line in captured: - assert "fps:" not in line + assert "FPS:" not in line # No stray double commas where the fps segment would have lived. assert ", ," not in line # Robot segment still present (unaffected by fps padding). @@ -630,7 +631,7 @@ def test_fps_key_absent_omits_segment(self, planner): method(fake, batch) for line in captured: - assert "fps:" not in line + assert "FPS:" not in line @pytest.mark.parametrize("planner", ["low", "high"]) def test_fps_slots_between_robot_and_control(self, planner): @@ -651,9 +652,9 @@ def test_fps_slots_between_robot_and_control(self, planner): line = captured[0] robot_idx = line.index("Robot: ") - fps_idx = line.index("fps: 20") + fps_idx = line.index("FPS: 20") control_idx = line.index("Control: ") - assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}" + assert robot_idx < fps_idx < control_idx, f"Expected Robot < FPS < Control ordering in {line!r}" @pytest.mark.parametrize("planner", ["low", "high"]) def test_fps_with_partial_segments(self, planner): @@ -675,12 +676,12 @@ def test_fps_with_partial_segments(self, planner): assert captured[0].startswith("Metadata: ") assert "Robot: franka, " in captured[0] - assert "fps: 30, " in captured[0] + assert "FPS: 30, " in captured[0] assert "Control:" not in captured[0] assert captured[1].startswith("Metadata: ") assert "Robot:" not in captured[1] - assert "fps: 50, " in captured[1] + assert "FPS: 50, " in captured[1] assert "Control: joint, " in captured[1] diff --git a/tests/policies/test_pi07_paligemma_high_level_planner.py b/tests/policies/test_pi07_paligemma_high_level_planner.py index 3014278b..bc6348a6 100644 --- a/tests/policies/test_pi07_paligemma_high_level_planner.py +++ b/tests/policies/test_pi07_paligemma_high_level_planner.py @@ -326,7 +326,8 @@ def capture_embed_prefix_infer(*args, **kwargs): # CPU-only tests for the metadata-string assembly loop in # ``PI07HighLevelPlannerPolicy.prepare_metadata``. Pin the contract that ``fps`` -# is tokenized with the lowercase ``"fps: N, "`` header positioned between +# is tokenized with the ``"FPS: N, "`` header (uppercase to match the sibling +# ``Speed:``/``Quality:``/``Mistake:`` labels) positioned after # ``Speed/Quality/Mistake`` (this planner has no ``Robot:``/``Control:`` # segments) and omitted entirely when ``fps_is_pad`` is True. @@ -381,7 +382,7 @@ def _pg_hl_base_batch(batch_size: int) -> dict: class TestPaligemmaHighLevelFpsSegment: - def test_fps_present_emits_lowercase_segment(self): + def test_fps_present_emits_uppercase_segment(self): method = PI07HighLevelPlannerPolicy.prepare_metadata fake, captured = _pg_hl_make_fake_planner() @@ -394,8 +395,8 @@ def test_fps_present_emits_lowercase_segment(self): assert len(captured) == 2 for line, fps in zip(captured, [30, 50], strict=True): assert line.startswith("Metadata: ") - assert f"fps: {fps}, " in line - assert "Fps:" not in line and "FPS:" not in line + assert f"FPS: {fps}, " in line + assert "fps:" not in line and "Fps:" not in line def test_fps_padded_omits_segment(self): method = PI07HighLevelPlannerPolicy.prepare_metadata @@ -407,7 +408,7 @@ def test_fps_padded_omits_segment(self): method(fake, batch) - assert "fps:" not in captured[0] + assert "FPS:" not in captured[0] # No stray comma where the fps segment would have lived. assert ", ," not in captured[0] @@ -421,4 +422,4 @@ def test_fps_key_absent_omits_segment(self): method(fake, batch) for line in captured: - assert "fps:" not in line + assert "FPS:" not in line diff --git a/tests/policies/test_pi07_paligemma_low_level.py b/tests/policies/test_pi07_paligemma_low_level.py index 366c3593..aeced8f8 100644 --- a/tests/policies/test_pi07_paligemma_low_level.py +++ b/tests/policies/test_pi07_paligemma_low_level.py @@ -2257,10 +2257,11 @@ def fake_sample_actions(batch, noise=None, action_prefix=None, delay=None): # CPU-only tests for the metadata-string assembly loop in # ``PI07PaligemmaLowLevelPolicy.prepare_metadata``. Pin the contract that -# ``fps`` is tokenized with the lowercase ``"fps: N, "`` header positioned -# between ``Robot:`` and ``Control:`` and omitted entirely when ``fps_is_pad`` -# is True (or when the keys are missing — ``_hydrate_metadata_batch`` defaults -# them to all-padded). +# ``fps`` is tokenized with the ``"FPS: N, "`` header (uppercase to match +# the sibling ``Speed:``/``Quality:``/``Mistake:``/``Robot:``/``Control:`` +# labels) positioned between ``Robot:`` and ``Control:`` and omitted entirely +# when ``fps_is_pad`` is True (or when the keys are missing — +# ``_hydrate_metadata_batch`` defaults them to all-padded). def _pg_ll_make_fake_policy(metadata_max_length: int = 4): @@ -2295,7 +2296,7 @@ def stub_call(metadata, **kwargs): class TestPaligemmaLowLevelFpsSegment: - def test_fps_present_emits_lowercase_segment(self): + def test_fps_present_emits_uppercase_segment(self): policy, captured = _pg_ll_make_fake_policy() batch_size = 2 batch = { @@ -2311,8 +2312,8 @@ def test_fps_present_emits_lowercase_segment(self): assert len(captured) == batch_size for line, fps in zip(captured, [30, 50], strict=True): assert line.startswith("Metadata: ") - assert f"fps: {fps}, " in line - assert "Fps:" not in line and "FPS:" not in line + assert f"FPS: {fps}, " in line + assert "fps:" not in line and "Fps:" not in line def test_fps_padded_omits_segment(self): policy, captured = _pg_ll_make_fake_policy() @@ -2326,7 +2327,7 @@ def test_fps_padded_omits_segment(self): PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) - assert "fps:" not in captured[0] + assert "FPS:" not in captured[0] assert ", ," not in captured[0] assert "Robot: franka, " in captured[0] @@ -2341,7 +2342,7 @@ def test_fps_key_absent_omits_segment(self): PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) for line in captured: - assert "fps:" not in line + assert "FPS:" not in line def test_fps_slots_between_robot_and_control(self): policy, captured = _pg_ll_make_fake_policy() @@ -2358,9 +2359,9 @@ def test_fps_slots_between_robot_and_control(self): line = captured[0] robot_idx = line.index("Robot: ") - fps_idx = line.index("fps: 20") + fps_idx = line.index("FPS: 20") control_idx = line.index("Control: ") - assert robot_idx < fps_idx < control_idx, f"Expected Robot < fps < Control ordering in {line!r}" + assert robot_idx < fps_idx < control_idx, f"Expected Robot < FPS < Control ordering in {line!r}" def test_fps_default_torch_long(self): """fps is hydrated with ``torch.long`` defaults; assert ``_row_long`` @@ -2376,7 +2377,7 @@ def test_fps_default_torch_long(self): PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) - # All three samples get fps: 25 from the scalar-broadcast. + # All three samples get FPS: 25 from the scalar-broadcast. assert len(captured) == 3 for line in captured: - assert "fps: 25, " in line + assert "FPS: 25, " in line From 07ce696a3bb428eac4a216899303a9328f7dfb58 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 19:54:26 -0700 Subject: [PATCH 4/7] docs(concepts): document fps in standard data format --- docs/source/concepts.rst | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst index 200034f3..22413879 100644 --- a/docs/source/concepts.rst +++ b/docs/source/concepts.rst @@ -28,7 +28,7 @@ This class: * Combines multiple ``LeRobotDataset`` and ``VQADataset`` instances. * Different weights can be assigned to each dataset to control the sampling frequency; if weights are omitted (or set to ``null`` in JSON), weights default to dataset lengths. * Aggregates statistics from all constituent datasets to ensure consistent normalization across the mixture. -* Resamples the action output frequency to match the action frequency specified in the configuration. +* Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). The per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) so the policy can condition on it. Metadata ^^^^^^^^ @@ -130,6 +130,21 @@ Like ``speed``, ``mistake``, and ``quality``, they participate in the ``metadata_drop_all_prob`` / ``metadata_drop_each_prob`` dropout rolls — see :ref:`Training-time dropout `. +``fps`` is the **effective per-sample frame rate** of the (possibly +resampled) action chunk: ``DatasetMixtureConfig.action_freq`` when set, +otherwise the dataset's native ``meta.fps``. Heterogeneous-frequency +mixtures (``action_freq=None``) need it so the policy can condition on +each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different +real-time horizons even when both are ``chunk_size`` frames long. Unlike +the other metadata fields, ``fps`` does **not** participate in the +dropout rolls — it's an intrinsic property of the chunk, not a noisy +label, so it's always non-pad when emitted. Emission is gated by +``DatasetMixtureConfig.emit_fps`` (default ``True``); set it to +``False`` to omit both ``fps`` and ``fps_is_pad`` from the sample dict +(useful when resuming a checkpoint that was trained without fps +conditioning, so the policy's metadata prefix doesn't gain an unfamiliar +``FPS:`` segment). + .. code-block:: python { @@ -182,6 +197,19 @@ see :ref:`Training-time dropout `. "quality": torch.LongTensor, # Scalar in {1,2,3,4,5}; episode-level quality score. "quality_is_pad": torch.BoolTensor, + "fps": torch.LongTensor, # Scalar; effective per-sample frame rate of the action chunk. + # When `DatasetMixtureConfig.action_freq` is set, every dataset is + # resampled to that rate (via `resolve_delta_timestamps`) and `fps` + # reports `action_freq`. When `action_freq is None` (the default), + # the chunk runs at the dataset's native `meta.fps`. Gated by + # `DatasetMixtureConfig.emit_fps` (default `True`); the key is + # omitted entirely when `emit_fps=False`. Does NOT participate in + # `metadata_drop_*_prob` — fps is an intrinsic property of the + # chunk, not a noisy label. + "fps_is_pad": torch.BoolTensor, # Always False when emitted (no dropout). Defaults to True only + # if a hand-built inference batch supplies `fps` without the + # paired flag — `prepare_metadata` then drops the segment. + "subgoal0": torch.Tensor, # shape (3, H, W), values in [0,1]. A single future frame from # camera0 sampled either at end-of-segment (with probability # `subgoal_end_of_segment_prob`) or uniformly in [t, t+4 seconds]. From 090b43659e5ed3d13a17f28d31fe2ffdf085c4a6 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 19:59:52 -0700 Subject: [PATCH 5/7] fix(datasets): emit_fps no longer crashes on VQA samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_emit_optional_keys` accessed `self.meta.fps` unconditionally when `emit_fps=True`, raising AttributeError on VQA datasets — whose `VQADatasetMetadata` (inheriting `pass` from `DatasetMetadata`) has no `fps` property. Any heterogeneous VLA + VQA mixture trained under the default config crashed at the first VQA sample fetch. Surface `fps -> None` on the `DatasetMetadata` base (overridden to int on `LeRobotDatasetMetadata`), then have `_emit_optional_keys` pad out the row (`fps=0, fps_is_pad=True`) when meta.fps is None. The policy's `prepare_metadata` already drops the `FPS:` segment for padded samples, so heterogeneous batches stay schema-aligned and the prefix omits the fps token for VQA rows. --- docs/source/concepts.rst | 6 ++- src/opentau/datasets/lerobot_dataset.py | 50 +++++++++++++---- tests/datasets/test_optional_keys.py | 71 +++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 11 deletions(-) diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst index 22413879..b0d790f9 100644 --- a/docs/source/concepts.rst +++ b/docs/source/concepts.rst @@ -138,7 +138,11 @@ each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different real-time horizons even when both are ``chunk_size`` frames long. Unlike the other metadata fields, ``fps`` does **not** participate in the dropout rolls — it's an intrinsic property of the chunk, not a noisy -label, so it's always non-pad when emitted. Emission is gated by +label, so it's always non-pad when emitted from a dataset that has a +real frame rate. Samples from VQA datasets (no temporal axis) emit +``fps=0, fps_is_pad=True`` so heterogeneous VLA + VQA mixtures stay +schema-aligned across the batch; the policy's ``prepare_metadata`` +then drops the ``FPS:`` segment for those rows. Emission is gated by ``DatasetMixtureConfig.emit_fps`` (default ``True``); set it to ``False`` to omit both ``fps`` and ``fps_is_pad`` from the sample dict (useful when resuming a checkpoint that was trained without fps diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py index 8fc43544..b756f821 100644 --- a/src/opentau/datasets/lerobot_dataset.py +++ b/src/opentau/datasets/lerobot_dataset.py @@ -301,9 +301,28 @@ def shapes(self) -> dict: """Shapes for the different features.""" return {key: tuple(ft["shape"]) for key, ft in self.features.items()} + @property + def fps(self) -> int | None: + """Native frame rate of the dataset. + + Returns ``None`` for datasets without a temporal axis (e.g. VQA + image-text datasets). Subclasses with real frame-rate metadata + (``LeRobotDatasetMetadata``) override this to return the int from + ``info["fps"]``. Downstream callers that emit fps as a sample key + treat ``None`` as the pad signal so heterogeneous mixtures + (VLA + VQA) stay batchable. + """ + return None + class VQADatasetMetadata(DatasetMetadata): - """Metadata class for vqa datasets (vision-language datasets).""" + """Metadata class for vqa datasets (vision-language datasets). + + Inherits ``fps -> None`` from :class:`DatasetMetadata` since VQA + samples have no temporal axis. :meth:`BaseDataset._emit_optional_keys` + sees this and emits ``fps_is_pad=True`` for VQA samples in a mixture, + keeping every batch row schema-aligned with the LeRobot samples. + """ pass @@ -887,10 +906,13 @@ def _emit_optional_keys(self, item: dict, standard_item: dict) -> None: ``self.emit_fps`` is True. Unlike the other metadata fields, ``fps`` does **not** participate in the dropout rolls — it's an intrinsic property of the (possibly resampled) chunk, not a noisy - label — so it's always non-padded when emitted. Set - ``emit_fps=False`` on the mixture to omit the keys entirely (the - policy's ``prepare_metadata`` falls through to its default-pad - path). + label — so it's always non-padded for samples that have a real + frame rate. Samples from VQA datasets (where + :attr:`DatasetMetadata.fps` is ``None``) emit + ``fps=0, fps_is_pad=True`` so a heterogeneous VLA + VQA mixture + stays schema-aligned across the batch. Set ``emit_fps=False`` on + the mixture to omit the keys entirely (the policy's + ``prepare_metadata`` falls through to its default-pad path). Dropout order: 1. ``history_state_drop_prob``: zero ``state`` and historical camera @@ -999,12 +1021,20 @@ def _roll(prob: float) -> bool: # `resolve_delta_timestamps`, so tokenizing the dataset's native # `meta.fps` would mislead the policy (the chunk timing differs). # When `action_freq is None` (no resampling), the chunk runs at the - # dataset's native rate. Always non-pad (no dropout) when emit_fps - # is enabled on the mixture. + # dataset's native rate. Datasets without a temporal axis (VQA; + # `DatasetMetadata.fps` returns ``None``) emit pad so heterogeneous + # VLA + VQA mixtures stay schema-aligned across the batch. Always + # non-pad (no dropout) for real-rate samples when emit_fps is + # enabled on the mixture. if self.emit_fps: - effective_fps = int(self._action_freq) if self._action_freq is not None else int(self.meta.fps) - standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long) - standard_item["fps_is_pad"] = torch.tensor(False) + native_fps = self.meta.fps + if native_fps is None: + standard_item["fps"] = torch.tensor(0, dtype=torch.long) + standard_item["fps_is_pad"] = torch.tensor(True) + else: + effective_fps = int(self._action_freq) if self._action_freq is not None else int(native_fps) + standard_item["fps"] = torch.tensor(effective_fps, dtype=torch.long) + standard_item["fps_is_pad"] = torch.tensor(False) def resize_with_pad(self, img, width, height, pad_value=0) -> torch.Tensor: """Resize an image to target dimensions with padding. diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py index 60ee2f58..81747bfa 100644 --- a/tests/datasets/test_optional_keys.py +++ b/tests/datasets/test_optional_keys.py @@ -791,3 +791,74 @@ def test_fps_reports_native_when_action_freq_none(self): out = ds._to_standard_data_format(_full_raw_item(ds)) assert out["fps"].item() == 50 assert out["fps_is_pad"].item() is False + + +# VQA datasets have no temporal axis — `DatasetMetadata.fps` returns ``None`` +# on the base class (and on ``VQADatasetMetadata``, which inherits via ``pass``). +# `_emit_optional_keys` must catch that and emit ``fps=0, fps_is_pad=True`` +# instead of crashing on ``int(None)`` — otherwise any heterogeneous mixture +# that includes a VQA dataset crashes at the first sample fetch under the +# default ``emit_fps=True``. + + +class TestFpsEmissionVQA: + def test_base_metadata_fps_returns_none(self): + """``DatasetMetadata.fps`` is ``None`` for non-LeRobot datasets.""" + from opentau.datasets.lerobot_dataset import DatasetMetadata, VQADatasetMetadata + + assert DatasetMetadata().fps is None + assert VQADatasetMetadata().fps is None + + def test_vqa_metadata_emits_pad_row(self): + """A dataset whose ``meta.fps`` is ``None`` (i.e. a VQA dataset) must + still produce a complete sample dict so heterogeneous batches stay + schema-aligned. The fps row is padded out as + ``fps=0, fps_is_pad=True`` (mirroring the speed/quality/mistake pad + shape) — the policy's ``prepare_metadata`` then drops the ``FPS:`` + segment for that sample. + """ + from opentau.datasets.lerobot_dataset import VQADatasetMetadata + + ds = _DummyBaseDataset(num_cams=1) + ds.meta = VQADatasetMetadata(info={"features": {}}) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert "fps" in out and "fps_is_pad" in out + assert out["fps"].dtype == torch.long + assert out["fps"].item() == 0 + assert out["fps_is_pad"].item() is True + + def test_vqa_pad_ignores_action_freq(self): + """Even when the mixture sets ``action_freq``, a VQA sample emits + pad — VQA has no actions to be at any rate, so reporting + ``action_freq`` would be misleading.""" + from opentau.datasets.lerobot_dataset import VQADatasetMetadata + + ds = _DummyBaseDataset(num_cams=1, action_freq=30.0) + ds.meta = VQADatasetMetadata(info={"features": {}}) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert out["fps"].item() == 0 + assert out["fps_is_pad"].item() is True + + def test_heterogeneous_vla_vqa_collate(self): + """The pad-row trick keeps batches schema-aligned across a + VLA + VQA mixture: default_collate stacks fps into a ``(B,)`` long + tensor and fps_is_pad into a ``(B,)`` bool tensor with the + per-sample on/off correctly set. + """ + from torch.utils.data import default_collate + + from opentau.datasets.lerobot_dataset import VQADatasetMetadata + + # LeRobot-shaped sample. + ds_vla = _DummyBaseDataset(num_cams=1, meta_fps=30) + item_vla = ds_vla._to_standard_data_format(_full_raw_item(ds_vla)) + + # VQA-shaped sample (same dummy class, real VQA metadata). + ds_vqa = _DummyBaseDataset(num_cams=1) + ds_vqa.meta = VQADatasetMetadata(info={"features": {}}) + item_vqa = ds_vqa._to_standard_data_format(_full_raw_item(ds_vqa)) + + batch = default_collate([item_vla, item_vqa]) + assert batch["fps"].dtype == torch.long + assert batch["fps"].tolist() == [30, 0] + assert batch["fps_is_pad"].tolist() == [False, True] From 1467b57435f55f03b70ff98573e1c1872eda43a6 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 20:18:01 -0700 Subject: [PATCH 6/7] test+docs(fps): pin mixed-pad batch + note metadata_drop_all semantic shift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `test_fps_mixed_pad_across_batch` in all three fps test classes — closes the per-sample fps_is_pad coverage gap, now reached in production by heterogeneous LeRobot + VQA mixtures where the VQA pad row (fps=0, fps_is_pad=True) sits next to a LeRobot row in the same batch. - concepts.rst: document the metadata_drop_all_prob=1.0 semantic shift under emit_fps=True (no longer "no metadata at all" — it's "fps-only metadata") and note metadata_max_length=52 headroom. --- docs/source/concepts.rst | 21 +++++++++++++- tests/policies/test_pi07_cpu.py | 29 +++++++++++++++++++ .../test_pi07_paligemma_high_level_planner.py | 18 ++++++++++++ .../policies/test_pi07_paligemma_low_level.py | 18 ++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst index b0d790f9..f67f4b97 100644 --- a/docs/source/concepts.rst +++ b/docs/source/concepts.rst @@ -132,7 +132,14 @@ see :ref:`Training-time dropout `. ``fps`` is the **effective per-sample frame rate** of the (possibly resampled) action chunk: ``DatasetMixtureConfig.action_freq`` when set, -otherwise the dataset's native ``meta.fps``. Heterogeneous-frequency +otherwise the dataset's native ``meta.fps``. The tokenized ``FPS: N, `` +segment adds ~3-4 BPE tokens to the metadata prefix, which still fits +comfortably inside the default ``metadata_max_length=52`` — but long +``robot_type`` strings combined with a fully-populated metadata batch +leave less headroom than before, and the underlying tokenizer call uses +``truncation=True`` silently. Bump ``metadata_max_length`` (a field on +each pi07 / pi07_paligemma config) if you start seeing the trailing +``Control:`` segment get clipped. Heterogeneous-frequency mixtures (``action_freq=None``) need it so the policy can condition on each sample's rate — a 30 Hz chunk and a 50 Hz chunk carry different real-time horizons even when both are ``chunk_size`` frames long. Unlike @@ -289,6 +296,18 @@ for reproducibility. - Per-field independent mask roll for each of ``speed``, ``mistake``, ``quality``, ``robot_type``, ``control_mode``. Only rolled when the shared drop did not fire. + +.. note:: + + ``fps`` is **not** in either drop pool — when ``emit_fps=True`` it + stays non-pad for every LeRobot sample regardless of the rolls. This + shifts the semantics of ``metadata_drop_all_prob=1.0``: pre-PR it + meant "no metadata segment at all", post-PR it means "fps-only + metadata segment" (the policy's ``has_metadata`` branch sees a + non-empty metadata mask and keeps the ``Metadata: FPS: N, `` block + in the prefix). If you're running a no-metadata ablation, also set + ``emit_fps=False`` on the mixture (and ``EnvMetadataConfig.emit_fps=False`` + at eval) to recover the pre-PR behaviour. * - ``val_enable_optional_key_dropout`` - ``False`` - Whether the five drop rolls above also fire on the **validation** diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py index c710f680..6b0d2a7e 100644 --- a/tests/policies/test_pi07_cpu.py +++ b/tests/policies/test_pi07_cpu.py @@ -684,6 +684,35 @@ def test_fps_with_partial_segments(self, planner): assert "FPS: 50, " in captured[1] assert "Control: joint, " in captured[1] + @pytest.mark.parametrize("planner", ["low", "high"]) + def test_fps_mixed_pad_across_batch(self, planner): + """Per-sample ``fps_is_pad`` must be honored row-by-row — sample 0 with + ``fps_is_pad=False`` keeps its FPS segment while sample 1 with + ``fps_is_pad=True`` drops it. This is the production path now + triggered by heterogeneous LeRobot + VQA mixtures, where the VQA + pad row (``fps=0, fps_is_pad=True``) sits next to a LeRobot row + (``fps=30, fps_is_pad=False``) in the same batch. + """ + method = self._planner_methods()[planner] + fake, captured = _make_fake_planner() + + batch_size = 2 + batch = { + "state": torch.zeros(batch_size, 1), + "fps": torch.tensor([30, 0], dtype=torch.long), + "fps_is_pad": torch.tensor([False, True]), + "robot_type": ["franka", "franka"], + } + + method(fake, batch) + + # Sample 0 (LeRobot): non-pad → FPS segment present. + assert "FPS: 30, " in captured[0] + assert "Robot: franka, " in captured[0] + # Sample 1 (VQA): pad → FPS segment dropped, no stray "FPS: 0, ". + assert "FPS:" not in captured[1] + assert "Robot: franka, " in captured[1] + # `embed_prefix` conditional-block guards # diff --git a/tests/policies/test_pi07_paligemma_high_level_planner.py b/tests/policies/test_pi07_paligemma_high_level_planner.py index bc6348a6..376ee61b 100644 --- a/tests/policies/test_pi07_paligemma_high_level_planner.py +++ b/tests/policies/test_pi07_paligemma_high_level_planner.py @@ -423,3 +423,21 @@ def test_fps_key_absent_omits_segment(self): for line in captured: assert "FPS:" not in line + + def test_fps_mixed_pad_across_batch(self): + """Per-sample ``fps_is_pad`` must be honored row-by-row — production + path for heterogeneous LeRobot + VQA mixtures, where the VQA pad + row (``fps=0, fps_is_pad=True``) sits next to a LeRobot row + (``fps=30, fps_is_pad=False``) in the same batch. + """ + method = PI07HighLevelPlannerPolicy.prepare_metadata + fake, captured = _pg_hl_make_fake_planner() + + batch = _pg_hl_base_batch(batch_size=2) + batch["fps"] = torch.tensor([30, 0], dtype=torch.long) + batch["fps_is_pad"] = torch.tensor([False, True]) + + method(fake, batch) + + assert "FPS: 30, " in captured[0] + assert "FPS:" not in captured[1] diff --git a/tests/policies/test_pi07_paligemma_low_level.py b/tests/policies/test_pi07_paligemma_low_level.py index aeced8f8..35254878 100644 --- a/tests/policies/test_pi07_paligemma_low_level.py +++ b/tests/policies/test_pi07_paligemma_low_level.py @@ -2381,3 +2381,21 @@ def test_fps_default_torch_long(self): assert len(captured) == 3 for line in captured: assert "FPS: 25, " in line + + def test_fps_mixed_pad_across_batch(self): + """Per-sample ``fps_is_pad`` must be honored row-by-row — production + path for heterogeneous LeRobot + VQA mixtures, where the VQA pad + row (``fps=0, fps_is_pad=True``) sits next to a LeRobot row + (``fps=30, fps_is_pad=False``) in the same batch. + """ + policy, captured = _pg_ll_make_fake_policy() + batch = { + "state": torch.zeros(2, 1), + "fps": torch.tensor([30, 0], dtype=torch.long), + "fps_is_pad": torch.tensor([False, True]), + } + + PI07PaligemmaLowLevelPolicy.prepare_metadata(policy, batch) + + assert "FPS: 30, " in captured[0] + assert "FPS:" not in captured[1] From 60bc00c810b1eb84982ff85a49742ba22d0d70a9 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 22 May 2026 20:35:29 -0700 Subject: [PATCH 7/7] feat!(datasets): flip emit_fps default to False (opt-in) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `DatasetMixtureConfig.emit_fps` and `EnvMetadataConfig.emit_fps` now default to `False` so pre-PR checkpoints resume cleanly — the policy's metadata prefix doesn't gain an unfamiliar `FPS:` segment until the user explicitly opts in. `action_freq=None` (mixed-frequency training) stays the default; opt-in to `emit_fps=True` to surface per-sample fps conditioning to the policy. BaseDataset's dm-None fallback matched to False for the "no mixture config" path (VQA-only / unit tests). Docs + tests updated to reflect the new default — existing fps tests now set `emit_fps=True` explicitly when they exercise the emission path. --- docs/source/concepts.rst | 41 +++++++++---------- src/opentau/configs/default.py | 40 ++++++++++++------- src/opentau/datasets/lerobot_dataset.py | 5 ++- src/opentau/envs/configs.py | 9 +++-- tests/configs/test_default.py | 18 +++++---- tests/datasets/test_optional_keys.py | 43 ++++++++++++-------- tests/envs/test_configs.py | 15 +++---- tests/envs/test_utils.py | 52 ++++++++++++------------- 8 files changed, 128 insertions(+), 95 deletions(-) diff --git a/docs/source/concepts.rst b/docs/source/concepts.rst index f67f4b97..41d032d7 100644 --- a/docs/source/concepts.rst +++ b/docs/source/concepts.rst @@ -28,7 +28,7 @@ This class: * Combines multiple ``LeRobotDataset`` and ``VQADataset`` instances. * Different weights can be assigned to each dataset to control the sampling frequency; if weights are omitted (or set to ``null`` in JSON), weights default to dataset lengths. * Aggregates statistics from all constituent datasets to ensure consistent normalization across the mixture. -* Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). The per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) so the policy can condition on it. +* Resamples the action output frequency to match the ``action_freq`` specified in the configuration. When ``action_freq`` is ``None`` (the default), resampling is disabled — each dataset is sampled at its native fps and a single batch can mix samples spanning different real-time horizons (set ``action_freq`` to a positive float to re-anchor every dataset to a common rate). When running mixed-frequency training, opt in to ``DatasetMixtureConfig.emit_fps=True`` so the per-sample effective rate is surfaced via the ``fps`` standard-format key (see :ref:`standard-data-format-optional-keys`) and the policy can condition on it. Metadata ^^^^^^^^ @@ -150,11 +150,13 @@ real frame rate. Samples from VQA datasets (no temporal axis) emit ``fps=0, fps_is_pad=True`` so heterogeneous VLA + VQA mixtures stay schema-aligned across the batch; the policy's ``prepare_metadata`` then drops the ``FPS:`` segment for those rows. Emission is gated by -``DatasetMixtureConfig.emit_fps`` (default ``True``); set it to -``False`` to omit both ``fps`` and ``fps_is_pad`` from the sample dict -(useful when resuming a checkpoint that was trained without fps -conditioning, so the policy's metadata prefix doesn't gain an unfamiliar -``FPS:`` segment). +``DatasetMixtureConfig.emit_fps`` (default ``False`` — pre-PR +checkpoints resume cleanly because the policy's metadata prefix doesn't +gain an unfamiliar ``FPS:`` segment). Flip to ``True`` for new training +runs that want per-sample fps conditioning, especially heterogeneous +mixtures where ``action_freq=None`` lets each dataset run at its native +rate. At inference, ``EnvMetadataConfig.emit_fps`` (same default +``False``) gates the eval-side broadcast of ``cfg.env.fps``. .. code-block:: python @@ -213,13 +215,14 @@ conditioning, so the policy's metadata prefix doesn't gain an unfamiliar # resampled to that rate (via `resolve_delta_timestamps`) and `fps` # reports `action_freq`. When `action_freq is None` (the default), # the chunk runs at the dataset's native `meta.fps`. Gated by - # `DatasetMixtureConfig.emit_fps` (default `True`); the key is - # omitted entirely when `emit_fps=False`. Does NOT participate in - # `metadata_drop_*_prob` — fps is an intrinsic property of the - # chunk, not a noisy label. - "fps_is_pad": torch.BoolTensor, # Always False when emitted (no dropout). Defaults to True only - # if a hand-built inference batch supplies `fps` without the - # paired flag — `prepare_metadata` then drops the segment. + # `DatasetMixtureConfig.emit_fps` (default `False` — opt-in); the + # key is omitted entirely when `emit_fps=False`. Does NOT + # participate in `metadata_drop_*_prob` — fps is an intrinsic + # property of the chunk, not a noisy label. + "fps_is_pad": torch.BoolTensor, # Always False when emitted from a real-rate sample. VQA samples + # (no temporal axis) emit `fps_is_pad=True` so heterogeneous + # VLA + VQA batches stay schema-aligned; `prepare_metadata` + # then drops the `FPS:` segment for those rows. "subgoal0": torch.Tensor, # shape (3, H, W), values in [0,1]. A single future frame from # camera0 sampled either at end-of-segment (with probability @@ -301,13 +304,11 @@ for reproducibility. ``fps`` is **not** in either drop pool — when ``emit_fps=True`` it stays non-pad for every LeRobot sample regardless of the rolls. This - shifts the semantics of ``metadata_drop_all_prob=1.0``: pre-PR it - meant "no metadata segment at all", post-PR it means "fps-only - metadata segment" (the policy's ``has_metadata`` branch sees a - non-empty metadata mask and keeps the ``Metadata: FPS: N, `` block - in the prefix). If you're running a no-metadata ablation, also set - ``emit_fps=False`` on the mixture (and ``EnvMetadataConfig.emit_fps=False`` - at eval) to recover the pre-PR behaviour. + means under ``emit_fps=True``, ``metadata_drop_all_prob=1.0`` produces + a "fps-only metadata segment" rather than "no metadata segment at + all" (the policy's ``has_metadata`` branch sees a non-empty metadata + mask and keeps the ``Metadata: FPS: N, `` block in the prefix). For a + true no-metadata ablation, keep the default ``emit_fps=False``. * - ``val_enable_optional_key_dropout`` - ``False`` - Whether the five drop rolls above also fire on the **validation** diff --git a/src/opentau/configs/default.py b/src/opentau/configs/default.py index 19b723ae..de2e7a87 100644 --- a/src/opentau/configs/default.py +++ b/src/opentau/configs/default.py @@ -234,15 +234,23 @@ class DatasetMixtureConfig: must have a non-empty ``control_mode`` after the optional ``DatasetConfig.control_mode`` override has been applied. Defaults to ``False`` (empty / missing values are allowed). - emit_fps: Whether ``__getitem__`` returns the dataset's native - ``meta.fps`` as the ``fps`` metadata key (``torch.long`` scalar, - paired with ``fps_is_pad=False``). Default ``True``. When - ``False``, neither ``fps`` nor ``fps_is_pad`` appears in the - sample dict and the pi07 / pi07_paligemma policy prefix omits - the ``fps:`` segment. Unlike the other metadata fields, ``fps`` - is **not** rolled by ``metadata_drop_*_prob`` — it's an intrinsic - property of the dataset, not a noisy label, so it is always - present when ``emit_fps=True``. + emit_fps: Whether ``__getitem__`` returns the *effective* + per-sample frame rate (``action_freq`` if set, else the + dataset's native ``meta.fps``) as the ``fps`` metadata key + (``torch.long`` scalar, paired with ``fps_is_pad=False``). + Default ``False`` — fps conditioning is an opt-in feature so + pre-PR checkpoints resume without the policy's metadata + prefix gaining an unfamiliar ``FPS:`` segment. Flip to + ``True`` for new training runs that want per-sample + frame-rate conditioning (especially heterogeneous-frequency + mixtures where ``action_freq=None`` lets each dataset run at + its native rate). Unlike the other metadata fields, ``fps`` + is **not** rolled by ``metadata_drop_*_prob`` — it's an + intrinsic property of the chunk, not a noisy label, so it + is always present (non-pad) for LeRobot samples when + ``emit_fps=True``. VQA samples (no temporal axis) emit + ``fps=0, fps_is_pad=True`` regardless so heterogeneous + VLA + VQA batches stay schema-aligned. tolerance_s: Mixture-wide default tolerance (in seconds) for the load-time ``check_timestamps_sync`` call inside ``LeRobotDataset.__init__``. Each dataset's frame-to-frame @@ -314,11 +322,15 @@ class DatasetMixtureConfig: require_non_empty_robot_type: bool = False require_non_empty_control_mode: bool = False - # Whether `__getitem__` emits the dataset's native fps as the `fps` - # metadata key. Independent of `metadata_drop_*_prob` — fps is intrinsic - # to the dataset, not a noisy label, so it is always present (never - # padded) when this is True. - emit_fps: bool = True + # Whether `__getitem__` emits the effective per-sample fps as the `fps` + # metadata key. Default `False` so pre-PR checkpoints resume cleanly + # (no new `FPS:` segment in the policy's metadata prefix). Flip to + # `True` for new training runs that want per-sample fps conditioning; + # especially relevant for heterogeneous-frequency mixtures + # (`action_freq=None`). Independent of `metadata_drop_*_prob` — fps + # is intrinsic to the chunk, not a noisy label, so it is always + # present (never padded) for LeRobot samples when this is True. + emit_fps: bool = False # Mixture-wide defaults for the load-time timestamp-sync check. Each # dataset can override these via `DatasetConfig.{tolerance_s, diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py index b756f821..b3ee9a32 100644 --- a/src/opentau/datasets/lerobot_dataset.py +++ b/src/opentau/datasets/lerobot_dataset.py @@ -721,7 +721,10 @@ def __init__(self, cfg: TrainPipelineConfig): # is True: the mixture's `action_freq` if set (every dataset has been # resampled to that rate via `resolve_delta_timestamps`), else the # dataset's native `meta.fps`. Independent of the dropout rolls above. - self.emit_fps = dm.emit_fps if dm else True + # Default-False fallback matches `DatasetMixtureConfig.emit_fps` so + # the "no mixture config" path (VQA-only / unit tests) behaves the + # same as the explicit-mixture default. + self.emit_fps = dm.emit_fps if dm else False self._action_freq = dm.action_freq if dm else None # Whether the above drop rolls actually fire. `make_dataset` flips this # off on the validation subset (unless `val_enable_optional_key_dropout` diff --git a/src/opentau/envs/configs.py b/src/opentau/envs/configs.py index 57bb09cd..cb0aef31 100644 --- a/src/opentau/envs/configs.py +++ b/src/opentau/envs/configs.py @@ -79,9 +79,10 @@ class EnvMetadataConfig: emit_fps: Whether to broadcast :attr:`EnvConfig.fps` as the ``fps`` metadata field at inference (paralleling :attr:`DatasetMixtureConfig.emit_fps` at training time). - Defaults to ``True``. When ``False``, the policy prefix omits - the ``fps:`` segment — useful when resuming a checkpoint - trained without fps conditioning. + Defaults to ``False`` — fps conditioning is opt-in so old + checkpoints resume cleanly (no surprise ``FPS:`` segment in + the policy's metadata prefix). Flip to ``True`` for + checkpoints trained with the training-side ``emit_fps=True``. """ speed: int | None = None @@ -89,7 +90,7 @@ class EnvMetadataConfig: mistake: bool | None = None robot_type: str | None = None control_mode: ControlMode | None = None - emit_fps: bool = True + emit_fps: bool = False def __post_init__(self) -> None: # `isinstance(x, bool)` guards exclude Python bools — `bool` is a diff --git a/tests/configs/test_default.py b/tests/configs/test_default.py index ef544129..3f3063b1 100644 --- a/tests/configs/test_default.py +++ b/tests/configs/test_default.py @@ -99,16 +99,20 @@ def test_action_freq_positive_still_valid(): assert cfg.action_freq == 30.0 -def test_emit_fps_defaults_to_true(): - """The new `emit_fps` toggle defaults to True so the policy gets fps conditioning out of the box.""" +def test_emit_fps_defaults_to_false(): + """The new `emit_fps` toggle is opt-in — pre-PR checkpoints resume cleanly + because the policy's metadata prefix doesn't gain an unfamiliar ``FPS:`` + segment by default. New training runs that want per-sample fps + conditioning set `emit_fps=True` explicitly. + """ cfg = DatasetMixtureConfig() - assert cfg.emit_fps is True + assert cfg.emit_fps is False -def test_emit_fps_can_be_disabled(): - """Setting `emit_fps=False` should be accepted (legacy-checkpoint resume escape hatch).""" - cfg = DatasetMixtureConfig(emit_fps=False) - assert cfg.emit_fps is False +def test_emit_fps_can_be_enabled(): + """Setting `emit_fps=True` opts in to per-sample fps tokenization.""" + cfg = DatasetMixtureConfig(emit_fps=True) + assert cfg.emit_fps is True def test_invalid_image_resample_strategy_raises_error(): diff --git a/tests/datasets/test_optional_keys.py b/tests/datasets/test_optional_keys.py index 81747bfa..02ecde46 100644 --- a/tests/datasets/test_optional_keys.py +++ b/tests/datasets/test_optional_keys.py @@ -64,7 +64,7 @@ def __init__( response_drop_prob=0.0, metadata_drop_all_prob=0.0, metadata_drop_each_prob=0.0, - emit_fps=True, + emit_fps=False, action_freq: float | None = None, meta_info: dict | None = None, meta_fps: int = 30, @@ -699,35 +699,44 @@ def test_emitted_as_python_strings_for_default_collate(self): # `fps` is emitted as an intrinsic dataset property — always present (non-pad) -# when `emit_fps=True`, omitted entirely when `emit_fps=False`. Unlike speed / -# quality / mistake / robot_type / control_mode it does NOT participate in any -# of the `metadata_drop_*_prob` dropout rolls. +# when `emit_fps=True`, omitted entirely when `emit_fps=False` (the default). +# Unlike speed / quality / mistake / robot_type / control_mode it does NOT +# participate in any of the `metadata_drop_*_prob` dropout rolls. class TestFpsEmission: def test_fps_emitted_with_meta_fps_value(self): - ds = _DummyBaseDataset(num_cams=1, meta_fps=20) + ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=20) out = ds._to_standard_data_format(_full_raw_item(ds)) assert out["fps"].dtype == torch.long assert out["fps"].item() == 20 assert out["fps_is_pad"].item() is False def test_fps_emitted_as_torch_long_dtype(self): - ds = _DummyBaseDataset(num_cams=1, meta_fps=50) + ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50) out = ds._to_standard_data_format(_full_raw_item(ds)) assert out["fps"].dtype == torch.long def test_fps_omitted_when_emit_fps_false(self): + # The default — pre-PR behaviour preserved. ds = _DummyBaseDataset(num_cams=1, emit_fps=False, meta_fps=20) out = ds._to_standard_data_format(_full_raw_item(ds)) assert "fps" not in out assert "fps_is_pad" not in out + def test_fps_omitted_by_default(self): + """The fixture default mirrors production: ``emit_fps=False`` ⇒ no fps keys.""" + ds = _DummyBaseDataset(num_cams=1, meta_fps=20) + out = ds._to_standard_data_format(_full_raw_item(ds)) + assert "fps" not in out + assert "fps_is_pad" not in out + def test_fps_unaffected_by_metadata_drop_all_prob(self): # fps is intrinsic to the dataset, not a noisy label — it must NOT # participate in `metadata_drop_*_prob` rolls. ds = _DummyBaseDataset( num_cams=1, + emit_fps=True, meta_fps=15, metadata_drop_all_prob=1.0, metadata_drop_each_prob=1.0, @@ -747,6 +756,7 @@ def test_fps_unaffected_by_disabled_dropout(self): # the same emit behavior — fps is gated by `emit_fps`, not by dropout. ds = _DummyBaseDataset( num_cams=1, + emit_fps=True, meta_fps=25, metadata_drop_all_prob=1.0, metadata_drop_each_prob=1.0, @@ -762,8 +772,8 @@ def test_default_collate_stacks_mixed_fps(self): tensor and `fps_is_pad` becomes a (B,) bool tensor.""" from torch.utils.data import default_collate - ds_a = _DummyBaseDataset(num_cams=1, meta_fps=30) - ds_b = _DummyBaseDataset(num_cams=1, meta_fps=50) + ds_a = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=30) + ds_b = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50) item_a = ds_a._to_standard_data_format(_full_raw_item(ds_a)) item_b = ds_b._to_standard_data_format(_full_raw_item(ds_b)) batch = default_collate([item_a, item_b]) @@ -779,7 +789,7 @@ def test_fps_reports_action_freq_when_resampling(self): different timing than the chunk actually carries. """ # Dataset is natively 50 Hz, but the mixture pins everything to 30 Hz. - ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=30.0) + ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50, action_freq=30.0) out = ds._to_standard_data_format(_full_raw_item(ds)) assert out["fps"].item() == 30 assert out["fps_is_pad"].item() is False @@ -787,7 +797,7 @@ def test_fps_reports_action_freq_when_resampling(self): def test_fps_reports_native_when_action_freq_none(self): """When `action_freq is None` (no resampling), the chunk runs at the dataset's native rate — that's what gets emitted.""" - ds = _DummyBaseDataset(num_cams=1, meta_fps=50, action_freq=None) + ds = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=50, action_freq=None) out = ds._to_standard_data_format(_full_raw_item(ds)) assert out["fps"].item() == 50 assert out["fps_is_pad"].item() is False @@ -797,8 +807,9 @@ def test_fps_reports_native_when_action_freq_none(self): # on the base class (and on ``VQADatasetMetadata``, which inherits via ``pass``). # `_emit_optional_keys` must catch that and emit ``fps=0, fps_is_pad=True`` # instead of crashing on ``int(None)`` — otherwise any heterogeneous mixture -# that includes a VQA dataset crashes at the first sample fetch under the -# default ``emit_fps=True``. +# with ``emit_fps=True`` crashes at the first VQA sample fetch. (Under the +# default ``emit_fps=False`` the path is a no-op; these tests opt in to +# exercise the heterogeneous-batch behaviour explicitly.) class TestFpsEmissionVQA: @@ -819,7 +830,7 @@ def test_vqa_metadata_emits_pad_row(self): """ from opentau.datasets.lerobot_dataset import VQADatasetMetadata - ds = _DummyBaseDataset(num_cams=1) + ds = _DummyBaseDataset(num_cams=1, emit_fps=True) ds.meta = VQADatasetMetadata(info={"features": {}}) out = ds._to_standard_data_format(_full_raw_item(ds)) assert "fps" in out and "fps_is_pad" in out @@ -833,7 +844,7 @@ def test_vqa_pad_ignores_action_freq(self): ``action_freq`` would be misleading.""" from opentau.datasets.lerobot_dataset import VQADatasetMetadata - ds = _DummyBaseDataset(num_cams=1, action_freq=30.0) + ds = _DummyBaseDataset(num_cams=1, emit_fps=True, action_freq=30.0) ds.meta = VQADatasetMetadata(info={"features": {}}) out = ds._to_standard_data_format(_full_raw_item(ds)) assert out["fps"].item() == 0 @@ -850,11 +861,11 @@ def test_heterogeneous_vla_vqa_collate(self): from opentau.datasets.lerobot_dataset import VQADatasetMetadata # LeRobot-shaped sample. - ds_vla = _DummyBaseDataset(num_cams=1, meta_fps=30) + ds_vla = _DummyBaseDataset(num_cams=1, emit_fps=True, meta_fps=30) item_vla = ds_vla._to_standard_data_format(_full_raw_item(ds_vla)) # VQA-shaped sample (same dummy class, real VQA metadata). - ds_vqa = _DummyBaseDataset(num_cams=1) + ds_vqa = _DummyBaseDataset(num_cams=1, emit_fps=True) ds_vqa.meta = VQADatasetMetadata(info={"features": {}}) item_vqa = ds_vqa._to_standard_data_format(_full_raw_item(ds_vqa)) diff --git a/tests/envs/test_configs.py b/tests/envs/test_configs.py index f475abd9..a7952e79 100644 --- a/tests/envs/test_configs.py +++ b/tests/envs/test_configs.py @@ -189,17 +189,18 @@ def test_all_none_default_passes(self): assert cfg.robot_type is None assert cfg.control_mode is None # `emit_fps` is a boolean toggle (not a None-presence field) and - # defaults to True — the policy receives an fps segment at eval - # unless the user explicitly opts out. - assert cfg.emit_fps is True - - def test_emit_fps_can_be_disabled(self): - cfg = EnvMetadataConfig(emit_fps=False) + # defaults to False — fps conditioning is opt-in so pre-PR + # checkpoints resume cleanly without an unfamiliar ``FPS:`` + # segment in the metadata prefix. assert cfg.emit_fps is False + def test_emit_fps_can_be_enabled(self): + cfg = EnvMetadataConfig(emit_fps=True) + assert cfg.emit_fps is True + def test_emit_fps_kept_orthogonal_to_other_fields(self): """Setting ``emit_fps`` must not perturb the other fields' defaults.""" - cfg = EnvMetadataConfig(emit_fps=False) + cfg = EnvMetadataConfig(emit_fps=True) assert cfg.speed is None assert cfg.quality is None assert cfg.mistake is None diff --git a/tests/envs/test_utils.py b/tests/envs/test_utils.py index ebb865a8..0bca2eb5 100644 --- a/tests/envs/test_utils.py +++ b/tests/envs/test_utils.py @@ -232,31 +232,31 @@ class TestAddEvalMetadata: silently changes the prefix the model sees at eval. """ - def test_all_none_default_skips_user_keys_but_still_emits_fps(self): + def test_all_none_default_skips_every_key(self): """``EnvMetadataConfig`` defaults: every user-controlled field is - ``None`` *but* ``emit_fps`` defaults to ``True``, so the helper still - injects ``fps`` / ``fps_is_pad`` from ``cfg.env.fps``. + ``None`` and ``emit_fps`` defaults to ``False``, so the helper + injects **no** metadata keys at all. This preserves pre-PR + behaviour for any eval config that doesn't opt in. """ obs = _make_obs() original_keys = set(obs.keys()) out = add_eval_metadata(obs, cfg=_make_cfg()) assert out is obs # mutated-in-place contract - new_keys = set(out.keys()) - original_keys - assert new_keys == {"fps", "fps_is_pad"}, ( - f"unexpected injected keys: {new_keys}; only fps/fps_is_pad should appear " - f"under the all-None metadata default" + assert set(out.keys()) == original_keys, ( + f"unexpected injected keys under all-None metadata default: {set(out.keys()) - original_keys}" ) - def test_emit_fps_false_skips_fps(self): - """``emit_fps=False`` on ``EnvMetadataConfig`` reverts the helper to the - legacy "skip all when nothing set" behaviour — useful when resuming a - checkpoint trained without fps conditioning. + def test_emit_fps_true_injects_fps(self): + """``emit_fps=True`` opts in to broadcasting ``cfg.env.fps`` as a + ``(B,)`` torch.long tensor with ``fps_is_pad=False``. """ obs = _make_obs() original_keys = set(obs.keys()) - out = add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False)) - assert set(out.keys()) == original_keys, ( - "no keys should be injected when emit_fps=False and every metadata field is None" + out = add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True)) + new_keys = set(out.keys()) - original_keys + assert new_keys == {"fps", "fps_is_pad"}, ( + f"unexpected injected keys: {new_keys}; emit_fps=True with no other " + f"metadata fields should inject only fps/fps_is_pad" ) @pytest.mark.parametrize("batch_size", [1, 4]) @@ -317,10 +317,10 @@ def test_string_fields_broadcast_as_list(self, key, value): assert f"{key}_is_pad" not in obs, "string fields use empty-string as pad signal, not a flag" def test_partial_fields_only_inject_specified_keys(self): - # `emit_fps=False` so this test isolates the user-set fields from the - # default fps injection (covered separately in test_emit_fps_*). + # `emit_fps` defaults to False, so this exercises the user-set fields + # in isolation from the fps emission path (covered separately). obs = _make_obs() - add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5")) + add_eval_metadata(obs, cfg=_make_cfg(speed=30, robot_type="UR5")) assert "speed" in obs and "speed_is_pad" in obs assert "robot_type" in obs @@ -340,26 +340,25 @@ def test_device_propagation(self): Using "meta" device makes this CPU-host portable (no real CUDA required) while still exercising the device-routing branch in the - helper. + helper. Opts in to ``emit_fps=True`` so the fps device-routing + branch is also exercised. """ obs = _make_obs(batch_size=2, device="meta") - add_eval_metadata(obs, cfg=_make_cfg(speed=20, mistake=True)) + add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True, speed=20, mistake=True)) assert obs["speed"].device.type == "meta" assert obs["speed_is_pad"].device.type == "meta" assert obs["mistake"].device.type == "meta" assert obs["mistake_is_pad"].device.type == "meta" - # fps gets injected too under the default `emit_fps=True`; it must - # share the same device as state. assert obs["fps"].device.type == "meta" assert obs["fps_is_pad"].device.type == "meta" @pytest.mark.parametrize("batch_size", [1, 4]) def test_fps_injects_long_tensor_from_env_fps(self, batch_size): - """``emit_fps=True`` (default) → broadcast ``cfg.env.fps`` as a + """``emit_fps=True`` opt-in → broadcast ``cfg.env.fps`` as a ``(B,)`` torch.long tensor with ``fps_is_pad=False`` rows.""" obs = _make_obs(batch_size=batch_size) - add_eval_metadata(obs, cfg=_make_cfg(fps=20)) # LiberoEnv-style fps + add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True, fps=20)) # LiberoEnv-style fps assert obs["fps"].dtype == torch.long assert obs["fps"].shape == (batch_size,) @@ -373,13 +372,14 @@ def test_fps_value_comes_from_env_not_metadata(self): """The fps value source is ``cfg.env.fps`` (env stepping freq), NOT a field on ``EnvMetadataConfig`` (which only carries the on/off toggle).""" obs = _make_obs(batch_size=2) - add_eval_metadata(obs, cfg=_make_cfg(fps=50)) + add_eval_metadata(obs, cfg=_make_cfg(emit_fps=True, fps=50)) assert torch.equal(obs["fps"], torch.full((2,), 50, dtype=torch.long)) def test_fps_disabled_skips_only_fps_keys(self): - """``emit_fps=False`` skips the fps injection but leaves other set - fields alone — they go through the regular None-presence gating.""" + """``emit_fps=False`` (the default) skips the fps injection but leaves + other set fields alone — they go through the regular None-presence + gating. The explicit ``emit_fps=False`` documents intent.""" obs = _make_obs(batch_size=2) add_eval_metadata(obs, cfg=_make_cfg(emit_fps=False, speed=30, robot_type="UR5"))