From 1adeb4bb10a2b1b1e06a3f2888ab9d1b3f4944bb Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 6 May 2026 11:40:43 -0700 Subject: [PATCH 1/4] chore(datasets): gate skip-timestamp-check warning to rank 0 LeRobotDataset is constructed once per rank, so the existing ``logging.warning`` at the skip-timestamp-check branch fires ``num_processes`` x ``num_datasets`` times. For a 392-dataset pretraining mixture on 8 GPUs that's ~3K identical log lines between accelerate init and the first training step, drowning out everything else. Reuse the existing ``get_proc_accelerator`` pattern (already used elsewhere in this file at L352 and L1287) to gate the warning to ``acc.is_main_process``, with a fall-through to logging when no Accelerator is set so single-process dev/test runs are unchanged. Behavior in practice: - 8-rank distributed run: 1x per dataset (was 8x) - single-process / CPU dev: unchanged - the underlying choice the user opted into (skip_timestamp_check on a heterogeneous mixture) and its safety implication are both unchanged --- src/opentau/datasets/lerobot_dataset.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py index d4453d52..93d1330f 100644 --- a/src/opentau/datasets/lerobot_dataset.py +++ b/src/opentau/datasets/lerobot_dataset.py @@ -1346,12 +1346,20 @@ def __init__( # Check timestamps # If transform is set, with_transform will decode all columns of a row before returning the desired column(s). if self.skip_timestamp_check: - logging.warning( - "Skipping timestamp sync check for %s (skip_timestamp_check=True). " - "Frame-to-frame spacing is NOT verified against 1/fps; downstream " - "delta_timestamps lookups may sample unintended frames.", - self.repo_id, - ) + # LeRobotDataset is constructed once per rank, so a naive + # ``logging.warning`` floods the run log with ``num_processes`` × + # ``num_datasets`` copies of the same message (392 × 8 ≈ 3K lines + # for a wide pretraining mixture). Gate to rank 0 — fall through + # to logging when no Accelerator is set (single-process dev / + # tests). + acc = get_proc_accelerator() + if acc is None or acc.is_main_process: + logging.warning( + "Skipping timestamp sync check for %s (skip_timestamp_check=True). " + "Frame-to-frame spacing is NOT verified against 1/fps; downstream " + "delta_timestamps lookups may sample unintended frames.", + self.repo_id, + ) else: no_transform_ds = self.hf_dataset.with_transform(None).with_format("numpy") logging.info("Checking timestamps synchronization...") From 3a36dce71dd30bb985044c9b534c619ec0ae6bf0 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 6 May 2026 11:48:24 -0700 Subject: [PATCH 2/4] chore(datasets): collapse skip-timestamp-check warning to once-per-process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit skip_timestamp_check is a mixture-wide decision and the warning text is identical for every dataset, so the previous "rank-0 only" gate still left ~392 lines per run for a wide pretraining mixture. Add a module-level _SKIP_TIMESTAMP_WARNED flag (mirroring the existing _CONTROL_MODE_WARNED dedup pattern) so the warning fires once per process, on the main rank only — once per run total instead of num_processes × num_datasets. The warning text now references the *first* dataset hit as an example rather than the only one, since the flag is mixture-wide; users who want a per-dataset listing can grep the resolved config. --- src/opentau/datasets/lerobot_dataset.py | 30 +++++++++++++++++-------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/opentau/datasets/lerobot_dataset.py b/src/opentau/datasets/lerobot_dataset.py index 93d1330f..a117fe2d 100644 --- a/src/opentau/datasets/lerobot_dataset.py +++ b/src/opentau/datasets/lerobot_dataset.py @@ -216,6 +216,14 @@ def wrapped(self, idx): # instances within a single process (e.g., train + val constructed for the same repo). _CONTROL_MODE_WARNED: set[str] = set() +# ``skip_timestamp_check`` is a mixture-wide decision (with optional per-dataset +# override) that produces an identical warning for every dataset in the mixture. +# For a 392-dataset pretraining run on 8 ranks, the naive per-dataset emission +# floods the run log with ~3K identical lines. This flag makes the warning fire +# once per process; combined with the rank-0 gate at the call site, that's once +# per run rather than 8 × num_datasets. +_SKIP_TIMESTAMP_WARNED: bool = False + def suppress_control_mode_warning(repo_id: str) -> None: """Mark ``repo_id`` as already-warned so the missing-``control_mode`` warning @@ -1346,17 +1354,21 @@ def __init__( # Check timestamps # If transform is set, with_transform will decode all columns of a row before returning the desired column(s). if self.skip_timestamp_check: - # LeRobotDataset is constructed once per rank, so a naive - # ``logging.warning`` floods the run log with ``num_processes`` × - # ``num_datasets`` copies of the same message (392 × 8 ≈ 3K lines - # for a wide pretraining mixture). Gate to rank 0 — fall through - # to logging when no Accelerator is set (single-process dev / - # tests). + # ``skip_timestamp_check`` is a mixture-wide decision and the + # message is identical for every dataset, so emit it once per + # process and only on the main rank. Naive per-dataset / per-rank + # logging floods the run log with ``num_processes`` × + # ``num_datasets`` copies of the same line (392 × 8 ≈ 3K for a + # wide pretraining mixture). Falls through to logging when no + # Accelerator is set (single-process dev / tests). + global _SKIP_TIMESTAMP_WARNED acc = get_proc_accelerator() - if acc is None or acc.is_main_process: + if not _SKIP_TIMESTAMP_WARNED and (acc is None or acc.is_main_process): + _SKIP_TIMESTAMP_WARNED = True logging.warning( - "Skipping timestamp sync check for %s (skip_timestamp_check=True). " - "Frame-to-frame spacing is NOT verified against 1/fps; downstream " + "skip_timestamp_check=True is in effect for one or more " + "datasets in this mixture (e.g. %s). Frame-to-frame " + "spacing is NOT verified against 1/fps; downstream " "delta_timestamps lookups may sample unintended frames.", self.repo_id, ) From 2a13f32d77c7ce4d1a4137b5287fd84c12362937 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 22:28:35 +0000 Subject: [PATCH 3/4] [claude-fix] address review feedback on #278 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - addresses @shuheng-liu (missing test): added test_skip_timestamp_warning_emitted_once_per_process to tests/datasets/test_datasets.py mirroring the _CONTROL_MODE_WARNED warn-once test — constructs two datasets with skip_timestamp_check=True and asserts exactly one matching log record via caplog, with an upfront ld_mod._SKIP_TIMESTAMP_WARNED = False reset to make the test independent of prior state. tests: passed — pytest -xvs tests/datasets/test_datasets.py::test_skip_timestamp_warning_emitted_once_per_process tests/datasets/test_datasets.py::test_control_mode_warning_emitted_once_per_repo Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/datasets/test_datasets.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index f79bd3ff..4c2a6ea0 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -1027,6 +1027,32 @@ def test_control_mode_warning_emitted_once_per_repo(tmp_path, lerobot_dataset_fa assert len(matching) == 1 +def test_skip_timestamp_warning_emitted_once_per_process(tmp_path, lerobot_dataset_factory, caplog): + """`skip_timestamp_check=True` warns exactly once per process across multiple + LeRobotDataset instances — locks in the `_SKIP_TIMESTAMP_WARNED` dedup so a + heterogeneous mixture of N datasets emits 1 line, not N.""" + from opentau.datasets import lerobot_dataset as ld_mod + + ld_mod._SKIP_TIMESTAMP_WARNED = False + + with caplog.at_level(logging.WARNING): + ds_a = lerobot_dataset_factory( + root=tmp_path / "skip_warn_a", + repo_id="warn-once/skip-ts-a", + skip_timestamp_check=True, + ) + ds_b = lerobot_dataset_factory( + root=tmp_path / "skip_warn_b", + repo_id="warn-once/skip-ts-b", + skip_timestamp_check=True, + ) + + assert ds_a.skip_timestamp_check is True + assert ds_b.skip_timestamp_check is True + matching = [r for r in caplog.records if "skip_timestamp_check=True" in r.getMessage()] + assert len(matching) == 1 + + def test_robot_type_and_control_mode_in_meta_info(tmp_path, lerobot_dataset_factory, info_factory): """robot_type and control_mode are surfaced as optional fields in the standard data format. Verify the underlying meta/info.json values that From 043c96820121d2b9e30a37f8122b2cb9fb772c96 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Thu, 7 May 2026 03:34:44 +0000 Subject: [PATCH 4/4] [claude-fix] address review feedback on #278 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - addresses @shuheng-liu (nit): wrapped test_skip_timestamp_warning_emitted_once_per_process body in try/finally so the module-level _SKIP_TIMESTAMP_WARNED flag is restored to its original value after the test runs, even on failure. Prevents cross-test bleed-through within the same pytest-xdist worker if a future test wants to assert the warning fires again. tests: passed — pytest -m "not gpu" -n auto tests/datasets/test_datasets.py (2 pre-existing failures in test_do_not_use_imagenet_stats unrelated to this change — HF cache misses for lerobot/droid_100 and lerobot/aloha_mobile_cabinet parquet files) Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/datasets/test_datasets.py | 35 ++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 4c2a6ea0..bb987ad3 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -1033,24 +1033,27 @@ def test_skip_timestamp_warning_emitted_once_per_process(tmp_path, lerobot_datas heterogeneous mixture of N datasets emits 1 line, not N.""" from opentau.datasets import lerobot_dataset as ld_mod + original = ld_mod._SKIP_TIMESTAMP_WARNED ld_mod._SKIP_TIMESTAMP_WARNED = False + try: + with caplog.at_level(logging.WARNING): + ds_a = lerobot_dataset_factory( + root=tmp_path / "skip_warn_a", + repo_id="warn-once/skip-ts-a", + skip_timestamp_check=True, + ) + ds_b = lerobot_dataset_factory( + root=tmp_path / "skip_warn_b", + repo_id="warn-once/skip-ts-b", + skip_timestamp_check=True, + ) - with caplog.at_level(logging.WARNING): - ds_a = lerobot_dataset_factory( - root=tmp_path / "skip_warn_a", - repo_id="warn-once/skip-ts-a", - skip_timestamp_check=True, - ) - ds_b = lerobot_dataset_factory( - root=tmp_path / "skip_warn_b", - repo_id="warn-once/skip-ts-b", - skip_timestamp_check=True, - ) - - assert ds_a.skip_timestamp_check is True - assert ds_b.skip_timestamp_check is True - matching = [r for r in caplog.records if "skip_timestamp_check=True" in r.getMessage()] - assert len(matching) == 1 + assert ds_a.skip_timestamp_check is True + assert ds_b.skip_timestamp_check is True + matching = [r for r in caplog.records if "skip_timestamp_check=True" in r.getMessage()] + assert len(matching) == 1 + finally: + ld_mod._SKIP_TIMESTAMP_WARNED = original def test_robot_type_and_control_mode_in_meta_info(tmp_path, lerobot_dataset_factory, info_factory):