From c9ac78e4ff200be3a585e802a8e242efad9b1bd9 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 1 May 2026 22:49:13 -0700 Subject: [PATCH 1/8] fix(pi07): gate optional prefix tokens in low- and high-level planners MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backports the #226-equivalent prefix-gating fixes from pi07_paligemma (commit b7d3a82) to pi07's low- and high-level planners, plus updates the affected CPU regression test to match the new layout. Low-level (modeling_pi07_low_level.py::embed_prefix) had three remaining divergences in the all-optionals-dropped case: - state-end was unconditionally ", " (2 tokens) instead of pi05's ":\n". - ";\n " prefix-end was unconditionally appended even with no optional content to terminate, dangling 2 spurious tokens after the state-end. - "Action: " indicator att_masks were [1] + [0]*(N-1), making the indicator a single bidirectional block; pi05 uses [1]*N (one causal block per token), which shifts the cumsum at the indicator -> first- discrete boundary and changes the discrete-action CE loss. Now compute has_any_optional once at the top of embed_prefix and gate the state-end string + the prefix-end block on it. Indicator att_masks switched to [1]*N to match pi05. High-level (modeling_pi07_high_level.py::embed_prefix) had one of the three: ";\n " was emitted unconditionally as if a free-floating block, but it is the metadata -> "Updated Memory:" separator. With no metadata there is nothing to terminate. Wrapped under `if metadata_tokens is not None:`. The "Updated Memory: " AR anchor itself stays unconditional — inference relies on it (memory_tokens is None at inference by design). The existing CPU regression test `test_all_optional_blocks_absent_skips _emission` had the OLD (buggy) layout baked into its assertions; updated to match the new gated layout (11 tokens total, ":\n" state-end, no trailing prefix-end). A byte-identity test against pi06 (analogous to pi07_paligemma vs pi05 in #226) is deliberately deferred to a follow-up issue: pi07_low_level uses continuous state with separate State:/Action: indicators while pi06 inlines discretized state in the language prompt, and pi07_high_level uses a different prompt template, so byte-equivalence isn't feasible without aligning pi06's prompts (or building pi06_mem). --- .../modeling_pi07_high_level.py | 40 +++++++---- .../modeling_pi07_low_level.py | 70 ++++++++++++++----- tests/policies/test_pi07_cpu.py | 18 +++-- 3 files changed, 90 insertions(+), 38 deletions(-) diff --git a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py index 2b8f372f..2af13448 100644 --- a/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py +++ b/src/opentau/policies/pi07/high_level_planner/modeling_pi07_high_level.py @@ -888,9 +888,16 @@ def embed_prefix( Gemma 3 embedding layer. **Concatenation order** (training when memory and response are provided): - ``[images | language | metadata | ";\\n " | "Updated Memory: " | memory_tokens | + ``[images | language | metadata? | ";\\n "? | "Updated Memory: " | memory_tokens | "Subtask: " | response_tokens]`` + ``";\\n "`` is gated on ``metadata_tokens`` — it serves as the metadata → + ``"Updated Memory:"`` separator, so when no metadata is provided there is + nothing to terminate and emitting it would dangle spurious tokens. The + ``"Updated Memory: "`` anchor itself is unconditional because inference + relies on it as the autoregressive starting point for memory decoding + (memory_tokens is None at inference by design). + When ``memory_tokens`` / ``response_tokens`` are omitted (inference), only the fixed spans before those segments are present; memory and subtask text are filled in via KV-cache decoding plus an explicit ``"Subtask: "`` injection before response AR. @@ -898,7 +905,7 @@ def embed_prefix( Attention pattern (via ``att_masks`` cumsums): - Image + language tokens: bidirectional (``0``). - Metadata (if present): new bidirectional block (``[1, 0, …, 0]``). - - ``";\\n "`` (same string as ``encode(";\n ", add_special_tokens=False)``): continues previous block (``0``). + - ``";\\n "`` (only when metadata present): continues previous block (``0``). - ``"Updated Memory: "``: new bidirectional block (``[1, 0, …, 0]``). - Memory token slots: causal segment (``1`` per slot). - ``"Subtask: "`` (training): new block then causal continuation within span. @@ -973,20 +980,25 @@ def embed_prefix( pad_masks.append(metadata_masks) att_masks += [1] + [0] * (metadata_emb.shape[1] - 1) - prefix_end_indicator_ids = self.language_tokenizer.encode(";\n ", add_special_tokens=False) - prefix_end_tokens = torch.tensor( - [prefix_end_indicator_ids] * bsize, - device=lang_tokens.device, - dtype=torch.long, - ) - prefix_end_emb = self.gemma3_with_expert.embed_language_tokens(prefix_end_tokens) + # ";\n " is the metadata -> "Updated Memory:" separator. With no metadata, + # there is nothing to terminate, so omit it; "Updated Memory:" still anchors + # AR memory decoding either way. + prefix_end_indicator_ids = self.language_tokenizer.encode(";\n ", add_special_tokens=False) + prefix_end_tokens = torch.tensor( + [prefix_end_indicator_ids] * bsize, + device=lang_tokens.device, + dtype=torch.long, + ) + prefix_end_emb = self.gemma3_with_expert.embed_language_tokens(prefix_end_tokens) - num_prefix_end_embs = prefix_end_emb.shape[1] - prefix_end_mask = torch.ones(bsize, num_prefix_end_embs, dtype=torch.bool, device=lang_tokens.device) + num_prefix_end_embs = prefix_end_emb.shape[1] + prefix_end_mask = torch.ones( + bsize, num_prefix_end_embs, dtype=torch.bool, device=lang_tokens.device + ) - embs.append(prefix_end_emb) - pad_masks.append(prefix_end_mask) - att_masks += [0] * num_prefix_end_embs + embs.append(prefix_end_emb) + pad_masks.append(prefix_end_mask) + att_masks += [0] * num_prefix_end_embs memory_start_indicator_ids = self.language_tokenizer.encode( "Updated Memory: ", add_special_tokens=False diff --git a/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py index 01dfeb41..4c53ba31 100644 --- a/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py +++ b/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py @@ -1152,17 +1152,24 @@ def embed_prefix( Concatenation order: - ``[videos | language | State: | state(T) | ", " | response | - Subgoal: | subgoal_images… | ", " | metadata | ";\\n " | + ``[videos | language | State: | state(T) | | response? | + Subgoal: | subgoal_images… | ", " | metadata? | ";\\n "? | ("Action:" + discrete_actions only when training)]`` + ```` is ``", "`` when at least one optional middle block + (response / subgoal / metadata) contributes real tokens, else ``":\\n"``. + The trailing ``";\\n "`` prefix-end is only emitted in the former case; + without optional content the state-end already serves as the separator + before ``"Action: "``, so appending another would dangle spurious tokens. + Attention pattern (via ``att_masks`` cumsums): - Video + language: bidirectional (``0``). - - ``State:``, projected state timestep tokens, comma after state: bidirectional (``0``). + - ``State:``, projected state timestep tokens, state-end separator: bidirectional (``0``). - Response spans: prefix-LM style block opening (``[1, 0, …]`` inside the segment). - ``Subgoal:``: new bidirectional block (``[1, 0, …]``). - Subgoal image patches per camera: bidirectional blocks (``[1, 0, …]``). - Commas/metadata / ``";\\n "``: mostly continued prefix blocks (see code). + - ``Action:`` indicator: each token is its own causal block (``[1, 1, …]``). - Discrete actions (training): causal ``1`` per timestep after ``Action:``. Args: @@ -1196,6 +1203,22 @@ def embed_prefix( att_masks = [] bsize = lang_tokens.shape[0] + # Whether any optional middle block (response / subgoal / metadata) will + # actually contribute real tokens to the prefix. When all are dropped the + # state-end separator collapses to ":\n" and the trailing prefix-end is + # omitted, eliminating spurious dangling tokens that would otherwise break + # the cumsum at the indicator -> first-discrete boundary. + has_response = ( + response_tokens is not None and response_masks is not None and bool(response_masks.any()) + ) + has_subgoal = ( + bool(subgoal_images) and bool(subgoal_img_masks) and any(m.any() for m in subgoal_img_masks) + ) + has_metadata = ( + metadata_tokens is not None and metadata_masks is not None and bool(metadata_masks.any()) + ) + has_any_optional = bool(has_response or has_subgoal or has_metadata) + for vid, vid_mask in zip(videos, vid_masks, strict=True): vid_emb = self.embed_video(vid) # (B, num_video_tokens, vlm_hidden) vid_emb = vid_emb.to(dtype=_preferred_dtype()) @@ -1252,7 +1275,10 @@ def embed_prefix( pad_masks.append(state_mask) att_masks += [0] * num_state_tokens # full attention with video and language - state_end_indicator_ids = self.language_tokenizer.encode(", ", add_special_tokens=False) + # When optional middle blocks follow, use ", " as a state -> optional separator; + # otherwise collapse to ":\n" so the trailing prefix-end can be omitted. + state_end_str = ", " if has_any_optional else ":\n" + state_end_indicator_ids = self.language_tokenizer.encode(state_end_str, add_special_tokens=False) state_end_tokens = torch.tensor( [state_end_indicator_ids] * bsize, device=lang_tokens.device, @@ -1352,20 +1378,26 @@ def embed_prefix( pad_masks.append(metadata_masks) att_masks += [1] + [0] * (metadata_emb.shape[1] - 1) - prefix_end_indicator_ids = self.language_tokenizer.encode(";\n ", add_special_tokens=False) - prefix_end_tokens = torch.tensor( - [prefix_end_indicator_ids] * bsize, - device=lang_tokens.device, - dtype=torch.long, - ) - prefix_end_emb = self.gemma3_with_expert.embed_language_tokens(prefix_end_tokens) + # Only emit the ";\n " prefix-end when at least one optional middle block was added + # above. With no optional content, the state-end already collapsed to ":\n" and acts + # as the separator before "Action: " — appending another would dangle spurious tokens. + if has_any_optional: + prefix_end_indicator_ids = self.language_tokenizer.encode(";\n ", add_special_tokens=False) + prefix_end_tokens = torch.tensor( + [prefix_end_indicator_ids] * bsize, + device=lang_tokens.device, + dtype=torch.long, + ) + prefix_end_emb = self.gemma3_with_expert.embed_language_tokens(prefix_end_tokens) - num_prefix_end_embs = prefix_end_emb.shape[1] - prefix_end_mask = torch.ones(bsize, num_prefix_end_embs, dtype=torch.bool, device=lang_tokens.device) + num_prefix_end_embs = prefix_end_emb.shape[1] + prefix_end_mask = torch.ones( + bsize, num_prefix_end_embs, dtype=torch.bool, device=lang_tokens.device + ) - embs.append(prefix_end_emb) - pad_masks.append(prefix_end_mask) - att_masks += [0] * num_prefix_end_embs + embs.append(prefix_end_emb) + pad_masks.append(prefix_end_mask) + att_masks += [0] * num_prefix_end_embs if discrete_actions is not None: discrete_action_start_indicator_ids = self.language_tokenizer.encode( @@ -1387,7 +1419,11 @@ def embed_prefix( embs.append(discrete_action_start_emb) pad_masks.append(discrete_action_start_mask) - att_masks += [1] + [0] * (num_discrete_action_start_embs - 1) + # Each "Action: " indicator token is its own causal block. Using + # [1] + [0]*(N-1) collapses them into a single bidirectional block, which + # shifts the cumsum at the indicator -> first-discrete boundary by N-1 + # and breaks the discrete-action CE loss. + att_masks += [1] * num_discrete_action_start_embs discrete_action_emb = self.gemma3_with_expert.embed_discrete_actions(discrete_actions) embs.append(discrete_action_emb.to(dtype=_preferred_dtype())) diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py index 85dea0eb..c65581ed 100644 --- a/tests/policies/test_pi07_cpu.py +++ b/tests/policies/test_pi07_cpu.py @@ -634,7 +634,12 @@ class TestEmbedPrefixConditionalGuards: def test_all_optional_blocks_absent_skips_emission(self): """All-False response_masks + no subgoal_images + all-False metadata_masks → - the prefix collapses to ``videos + lang + State: + state + ", " + ";\\n "``. + the prefix collapses to ``videos + lang + State: + state + ":\\n"``. + + With ``has_any_optional == False`` the state-end separator collapses + from ``", "`` to ``":\\n"`` (same fake-tokenizer length: 2 tokens) and + the trailing ``";\\n "`` prefix-end is omitted entirely so it cannot + dangle as a spurious separator before ``"Action: "``. """ method = _embed_prefix_method() fake = _make_fake_flow_matching() @@ -658,12 +663,11 @@ def test_all_optional_blocks_absent_skips_emission(self): # lang: 3 (prompt_len) # "State: ": 2 # state(T=1): 1 - # ", ": 2 - # ";\n ": 2 - # Total = 13 - assert embs.shape == (bsize, 13, 4) - assert pad_masks.shape == (bsize, 13) - assert att_masks.shape == (bsize, 13) + # ":\n": 2 (state-end; collapsed from ", " because no optionals) + # Total = 11 (";\n " prefix-end is omitted with no optional content) + assert embs.shape == (bsize, 11, 4) + assert pad_masks.shape == (bsize, 11) + assert att_masks.shape == (bsize, 11) # No causal boundaries (1's) anywhere in the att_masks — every block # should remain bidirectional with all-skip optional blocks. assert int(att_masks[0].sum().item()) == 0, ( From ef68a93771161291acff90dd2ef250cbb8626785 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 29 Apr 2026 14:22:16 -0700 Subject: [PATCH 2/8] feat(ci): pin Claude workflows to Opus 4.7 1M, allow test-skip on doc fixes (#208) (cherry picked from commit b9b3841b0515e7eb866d1718d9d5e7eeda3acb43) --- .github/workflows/claude-implement-fixes.yml | 22 ++++++++++++++++++-- .github/workflows/claude-pr-review.yml | 2 +- .github/workflows/extract-claude-lessons.yml | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/claude-implement-fixes.yml b/.github/workflows/claude-implement-fixes.yml index f88d9231..e3e97bc9 100644 --- a/.github/workflows/claude-implement-fixes.yml +++ b/.github/workflows/claude-implement-fixes.yml @@ -100,6 +100,7 @@ jobs: with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} claude_args: | + --model ${{ vars.CLAUDE_MODEL || 'claude-opus-4-7[1m]' }} --permission-mode bypassPermissions prompt: | A reviewer asked you to address review feedback on this PR. @@ -111,16 +112,33 @@ jobs: 2. For each actionable comment containing `@claude fix`, implement the fix on the PR's branch. 3. Skip comments that are questions, taste preferences, or already addressed. - 4. Run the test command from CLAUDE.md before pushing. + 4. Decide whether to run tests: + - If the diff is purely documentation, comments, formatting, + string-literal text, or otherwise CANNOT change runtime behavior, + you MAY skip tests. Be honest about confidence — when in doubt, run. + If you skip, the commit body MUST contain a line of the form: + tests: skipped — + - Otherwise (changes touching imports, function bodies, control + flow, types, configs read at runtime, dependencies, or build + manifests): run `pytest -m "not gpu" -n auto`. Scope to the + changed subtree where possible (e.g. `pytest tests/policies/test_pi05.py` + for a pi05 change) to keep the run fast. The commit body MUST contain: + tests: passed — + If tests fail: do NOT push; reply on the relevant PR comment + explaining the failure and stop. 5. Make ONE commit at the end of the session that addresses every comment you decided to act on — do NOT push one commit per comment. Subject line (must be < 80 chars per CLAUDE.md): [claude-fix] address review feedback on #${{ github.event.issue.number || github.event.pull_request.number }} - Commit body: a bulleted list, one bullet per addressed comment: + Commit body: a bulleted list, one bullet per addressed comment, + followed by the `tests:` line from step 4: - addresses @ (): + ... + tests: passed — pytest -m "not gpu" tests/policies/test_pi05.py + (or: tests: skipped — comment-only change, no runtime impact) Then push the single commit to the PR branch. 6. Reply individually to each addressed comment on the PR with diff --git a/.github/workflows/claude-pr-review.yml b/.github/workflows/claude-pr-review.yml index def19ebe..68ef60c8 100644 --- a/.github/workflows/claude-pr-review.yml +++ b/.github/workflows/claude-pr-review.yml @@ -50,7 +50,7 @@ jobs: with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} claude_args: | - --model claude-opus-4-7 + --model ${{ vars.CLAUDE_MODEL || 'claude-opus-4-7[1m]' }} --permission-mode bypassPermissions prompt: | You are reviewing PR #${{ github.event.pull_request.number }} in diff --git a/.github/workflows/extract-claude-lessons.yml b/.github/workflows/extract-claude-lessons.yml index 5826321e..98a1da30 100644 --- a/.github/workflows/extract-claude-lessons.yml +++ b/.github/workflows/extract-claude-lessons.yml @@ -39,6 +39,7 @@ jobs: with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} claude_args: | + --model ${{ vars.CLAUDE_MODEL || 'claude-opus-4-7[1m]' }} --permission-mode bypassPermissions prompt: | Review the comments on this merged PR. If any reviewer feedback From 844c39ec7f37fd94ae509016898bc7a99baad582 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 29 Apr 2026 16:08:07 -0700 Subject: [PATCH 3/8] fix(ci): gate extract-claude-lessons on head branch, not PR author (#212) (cherry picked from commit 35629df59927984706ed7569c932eaf26158683f) --- .github/workflows/extract-claude-lessons.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/extract-claude-lessons.yml b/.github/workflows/extract-claude-lessons.yml index 98a1da30..e0a7cbf0 100644 --- a/.github/workflows/extract-claude-lessons.yml +++ b/.github/workflows/extract-claude-lessons.yml @@ -25,9 +25,13 @@ permissions: jobs: extract-lessons: + # Gate on the head branch name, not user.login: in this repo Claude Code + # pushes to a `claude/*` branch and a human opens the PR, so the PR's + # `user.login` never contains 'claude'. The branch prefix is the reliable + # signal that Claude touched the PR. if: >- github.event.pull_request.merged == true - && contains(github.event.pull_request.user.login, 'claude') + && startsWith(github.event.pull_request.head.ref, 'claude/') && !startsWith(github.event.pull_request.title, 'chore(claude): learn from') runs-on: ubuntu-latest timeout-minutes: 20 From dda4446fa13b0b16f6fc68f60db40f7d66aec674 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 2 May 2026 06:13:44 +0000 Subject: [PATCH 4/8] [claude-fix] address review feedback on #229 - addresses @claude (LL embed_prefix discrete_actions test): added test_discrete_actions_indicator_uses_per_token_causal_blocks pinning the [1]*N att_masks pattern on the "Action: " indicator + discrete action span, so a regression to [1]+[0]*(N-1) fails immediately. - addresses @claude (has_subgoal style nit): wrapped m.any() in bool(...) inside the any(...) generator so has_subgoal materializes Python bools matching has_response/has_metadata. tests: passed - pytest -m "not gpu" -n auto tests/policies/ Co-Authored-By: Claude Opus 4.7 (1M context) --- .../modeling_pi07_low_level.py | 2 +- tests/policies/test_pi07_cpu.py | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py index 4c53ba31..ae18a54e 100644 --- a/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py +++ b/src/opentau/policies/pi07/low_level_planner/modeling_pi07_low_level.py @@ -1212,7 +1212,7 @@ def embed_prefix( response_tokens is not None and response_masks is not None and bool(response_masks.any()) ) has_subgoal = ( - bool(subgoal_images) and bool(subgoal_img_masks) and any(m.any() for m in subgoal_img_masks) + bool(subgoal_images) and bool(subgoal_img_masks) and any(bool(m.any()) for m in subgoal_img_masks) ) has_metadata = ( metadata_tokens is not None and metadata_masks is not None and bool(metadata_masks.any()) diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py index c65581ed..35b49df9 100644 --- a/tests/policies/test_pi07_cpu.py +++ b/tests/policies/test_pi07_cpu.py @@ -748,3 +748,57 @@ def test_response_mask_any_true_emits_block(self): assert per_sample_sum == 1, ( f"expected exactly one causal boundary from the response block opening, got {per_sample_sum}" ) + + def test_discrete_actions_indicator_uses_per_token_causal_blocks(self): + """The ``"Action: "`` indicator must use ``[1]*N`` (one causal block per + token), not ``[1] + [0]*(N-1)`` (single bidirectional block). + + The buggy ``[1] + [0]*(N-1)`` pattern collapses the indicator into one + bidirectional block, shifting the cumsum at the indicator -> first-discrete + boundary by N-1 and breaking the discrete-action CE loss. This test pins + the tail of ``att_masks`` to all 1's after the indicator, so a regression + to the old pattern fails immediately. + """ + method = _embed_prefix_method() + fake = _make_fake_flow_matching() + bsize = 2 + kwargs = _build_default_inputs(batch_size=bsize) + # No optional middle blocks — keeps the prefix layout deterministic so + # the indicator + discrete-action span sits exactly at the tail. + kwargs["subgoal_images"] = [] + kwargs["subgoal_img_masks"] = [] + + num_action_tokens = 3 + kwargs["discrete_actions"] = torch.zeros(bsize, num_action_tokens, dtype=torch.long) + kwargs["discrete_action_masks"] = torch.ones(bsize, num_action_tokens, dtype=torch.bool) + + _, pad_masks, att_masks = method(fake, **kwargs) + + # Expected layout (no optional blocks; fake tokenizer encodes every + # indicator phrase to 2 tokens; ``discrete_action_emb`` matches its + # input shape): + # videos(3) + lang(3) + "State: "(2) + state(1) + ":\n"(2) + # + "Action: "(2) + discrete_actions(3) = 16 + num_indicator_tokens = 2 + base_prefix_len = 11 + total_len = base_prefix_len + num_indicator_tokens + num_action_tokens + assert att_masks.shape == (bsize, total_len) + assert pad_masks.shape == (bsize, total_len) + + # The first ``base_prefix_len`` positions are bidirectional (no causal + # boundaries) — same invariant as test_all_optional_blocks_absent. + assert int(att_masks[0, :base_prefix_len].sum().item()) == 0 + + # Tail invariant: every position from the indicator onward must be 1 + # (per-token causal blocks for the indicator + one causal step per + # discrete action). A regression to ``[1] + [0]*(N-1)`` on the + # indicator would put zeros at indices ``base_prefix_len + 1 .. + # base_prefix_len + num_indicator_tokens - 1`` and the assertion below + # would fail. + tail = att_masks[0, base_prefix_len:] + assert int(tail.sum().item()) == num_indicator_tokens + num_action_tokens, ( + f"expected all-ones tail of length {num_indicator_tokens + num_action_tokens} " + f"(indicator + discrete actions), got {tail.tolist()} — a zero in the indicator " + "span signals a regression to the old [1]+[0]*(N-1) pattern, which shifts the " + "cumsum at the indicator -> first-discrete boundary and breaks the CE loss." + ) From 23c038996ed1416b132af5c68eae131aec542fc2 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 29 Apr 2026 14:56:53 -0700 Subject: [PATCH 5/8] ci: skip pi07_paligemma low-level test (broken at import) to unblock CPU CI (#209) (cherry picked from commit a9dbc4a176e3d40c37ea38ae69095de9314a4ce9) --- .github/workflows/cpu_test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cpu_test.yml b/.github/workflows/cpu_test.yml index 2ed9b5f5..72cf0066 100644 --- a/.github/workflows/cpu_test.yml +++ b/.github/workflows/cpu_test.yml @@ -102,7 +102,9 @@ jobs: python3 -c "import sys; print(sys.path)" python3 -c "import libero.libero" && echo "LIBERO config set successfully." echo "Running cpu based pytest and generating coverage report..." - pytest -m "not gpu" -n auto -v --cov=lerobot/ --cov-report=xml:cpu_test/cpu_test.xml --ignore=tests/planner/test_planner.py --ignore tests/utils/test_libero_utils.py --deselect=tests/envs/test_factory.py::TestMakeEnv::test_make_env_async_vector_env --deselect=tests/envs/test_factory.py::TestMakeEnv::test_make_env_sync_vector_env tests/ + # TODO(#210): drop --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py once pi07 migrates to SpaceTimeSiglipVideoEncoder (#192). + # TODO(#210): drop --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py once pi07 migrates to SpaceTimeSiglipVideoEncoder (#192). + pytest -m "not gpu" -n auto -v --cov=lerobot/ --cov-report=xml:cpu_test/cpu_test.xml --ignore=tests/planner/test_planner.py --ignore tests/utils/test_libero_utils.py --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py --deselect=tests/envs/test_factory.py::TestMakeEnv::test_make_env_async_vector_env --deselect=tests/envs/test_factory.py::TestMakeEnv::test_make_env_sync_vector_env tests/ echo "Pytest execution and coverage report generation completed." - name: Upload coverage reports From 1b0e3a92495fd36339469c1e87c311faf4796312 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Thu, 30 Apr 2026 12:47:11 -0700 Subject: [PATCH 6/8] ci: ignore broken pi07 test in GPU CI; dedupe TODO in cpu_test (#217) (cherry picked from commit 16c46e8b552960d65837ee437835057241a5cbc7) --- .github/workflows/cpu_test.yml | 1 - .github/workflows/gpu_test.yml | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cpu_test.yml b/.github/workflows/cpu_test.yml index 72cf0066..bc6f8be9 100644 --- a/.github/workflows/cpu_test.yml +++ b/.github/workflows/cpu_test.yml @@ -103,7 +103,6 @@ jobs: python3 -c "import libero.libero" && echo "LIBERO config set successfully." echo "Running cpu based pytest and generating coverage report..." # TODO(#210): drop --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py once pi07 migrates to SpaceTimeSiglipVideoEncoder (#192). - # TODO(#210): drop --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py once pi07 migrates to SpaceTimeSiglipVideoEncoder (#192). pytest -m "not gpu" -n auto -v --cov=lerobot/ --cov-report=xml:cpu_test/cpu_test.xml --ignore=tests/planner/test_planner.py --ignore tests/utils/test_libero_utils.py --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py --deselect=tests/envs/test_factory.py::TestMakeEnv::test_make_env_async_vector_env --deselect=tests/envs/test_factory.py::TestMakeEnv::test_make_env_sync_vector_env tests/ echo "Pytest execution and coverage report generation completed." diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml index 453db4b5..9c52a9f5 100644 --- a/.github/workflows/gpu_test.yml +++ b/.github/workflows/gpu_test.yml @@ -91,7 +91,8 @@ jobs: source .venv/bin/activate mkdir -p /tmp/libero-assets/libero/libero export LIBERO_CONFIG_PATH="$(pwd)/.github/assets/libero" - pytest -m "gpu" -n 0 -v tests/ + # TODO(#210): drop --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py once pi07 migrates to SpaceTimeSiglipVideoEncoder (#192). + pytest -m "gpu" -n 0 -v --ignore=tests/policies/test_pi07_paligemma_low_level_planner.py tests/ stop-runner: name: Stop GPU Runner From 080c190adc4c54e01b0679c7cd14ec5072706470 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Wed, 29 Apr 2026 22:24:40 -0700 Subject: [PATCH 7/8] fix(test): override actions stats shape in pi06 GPU smoke MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `test_complete_pi06_pipeline_integration_smoke` was added in #178 with chunk_size=10 but the shared `lerobot_dataset_metadata` fixture provides actions stats shaped (50, 32) — matching the default PI06Config. The Normalize buffer was therefore (50, 32) while the test batch's actions were (B, 10, 32), and MIN_MAX normalization in normalize.py:232 raised ``RuntimeError: The size of tensor a (10) must match the size of tensor b (50) at non-singleton dimension 1``. Pre-existing bug — never caught in CI because the test is gated by @pytest.mark.gpu and skipped in CPU runs. Surfaced now while validating this PR's SDPA + grad-ckpt port on a real GPU. Fix by deep-copying the fixture stats and reshaping the actions max/mean/min/std arrays from (50, 32) to (chunk_size, 32) before constructing the policy. Same numeric values, just the right shape. (cherry picked from commit 6425cb4b490babd7848f2e22519bbe9b196f3301) --- tests/policies/test_pi06.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/policies/test_pi06.py b/tests/policies/test_pi06.py index 5bf32a57..f6b74538 100644 --- a/tests/policies/test_pi06.py +++ b/tests/policies/test_pi06.py @@ -513,7 +513,25 @@ def test_complete_pi06_pipeline_integration_smoke(lerobot_dataset_metadata): config.output_features = {k: ft for k, ft in features.items() if ft.type is FeatureType.ACTION} config.input_features = {k: ft for k, ft in features.items() if k not in config.output_features} - policy = PI06Policy(config, dataset_stats=lerobot_dataset_metadata.stats) + # The shared lerobot_dataset_metadata fixture carries actions stats shaped + # (50, 32) — matching the default PI06Config(chunk_size=50). This test + # uses chunk_size=10 to keep the model small, so override the actions + # stats to (10, 32) before constructing Normalize buffers; otherwise + # `(actions - min) / (max - min + EPS)` mismatches at dim=1 (actions is + # (B, 10, 32) but the buffer is (50, 32)). + import copy + + import numpy as np + + dataset_stats = copy.deepcopy(lerobot_dataset_metadata.stats) + for k in ("max", "mean", "min", "std"): + dataset_stats["actions"][k] = np.full( + (config.chunk_size, 32), + float(dataset_stats["actions"][k].flatten()[0]), + dtype=np.float32, + ) + + policy = PI06Policy(config, dataset_stats=dataset_stats) policy.to(dtype=torch.bfloat16, device="cuda") batch = { From bb947357831589154768a33eeef476343fa84964 Mon Sep 17 00:00:00 2001 From: Shuheng Liu Date: Fri, 1 May 2026 23:27:42 -0700 Subject: [PATCH 8/8] fix(test): subtract Action: indicator len in pi07 LL position-id check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_verify_position_ids` and `_verify_action_expert_attention_mask` in test_pi07_low_level_planner.py only excluded DISCRETE_ACTION_MAX_LENGTH from the cross-attention prefix slice. The model's forward path (modeling_pi07_low_level.py:1575-1620) excludes both the "Action: " indicator len (3 tokens via Gemma 3 tokenizer) AND discrete_action_max_length, matching pi05's discrete_action_indicator_max_length logic — so the action expert sees the same prefix length at train and inference. The test was a latent break since 2949e738 ("Applying pi07_paligemma fixes to pi07") added the indicator subtraction to the model but never updated the test. Surfaces as soon as the GPU test test_complete_pi07_low_level_pipeline runs and asserts suffix_position_ids — actual was prefix_offset+(token), expected was prefix_offset+indicator_len+(token), 3-token gap on Gemma 3. Pull the indicator length from the existing _indicator_lens helper so the fix tracks future tokenizer changes automatically. Plumb the tokenizer through the two _verify_action_expert_attention_mask call sites. --- tests/policies/test_pi07_low_level_planner.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/policies/test_pi07_low_level_planner.py b/tests/policies/test_pi07_low_level_planner.py index 1b233c13..52ef1dc3 100644 --- a/tests/policies/test_pi07_low_level_planner.py +++ b/tests/policies/test_pi07_low_level_planner.py @@ -291,7 +291,14 @@ def _verify_position_ids( if inference_mode: prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None] else: - prefix_offsets = torch.sum(prefix_pad_masks[:, :-DISCRETE_ACTION_MAX_LENGTH], dim=-1)[:, None] + # Training: model's prefix_offsets exclude both the "Action: " indicator + # and the discrete-action span from cross-attention (matches pi05's + # discrete_action_indicator_max_length logic) so the action expert sees + # the same prefix length at train and inference. + action_lead_len = self._indicator_lens(tokenizer)["action_lead"] + prefix_offsets = torch.sum( + prefix_pad_masks[:, : -(action_lead_len + DISCRETE_ACTION_MAX_LENGTH)], dim=-1 + )[:, None] expected_suffix = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1 assert torch.equal(suffix_position_ids, expected_suffix) @@ -312,12 +319,16 @@ def _verify_action_expert_attention_mask( prefix_pad_masks, suffix_pad_masks, suffix_att_masks, + tokenizer, inference_mode=False, ): if inference_mode: num_cross = prefix_pad_masks.shape[1] else: - num_cross = prefix_pad_masks.shape[1] - DISCRETE_ACTION_MAX_LENGTH + # Training: cross-attention excludes both the "Action: " indicator and + # the discrete-action span (mirrors the prefix_offsets logic above). + action_lead_len = self._indicator_lens(tokenizer)["action_lead"] + num_cross = prefix_pad_masks.shape[1] - action_lead_len - DISCRETE_ACTION_MAX_LENGTH expected = make_att_2d_masks( suffix_pad_masks, @@ -478,6 +489,7 @@ def capture_embed_suffix(*args, **kwargs): captured["prefix_pad_masks"], captured["suffix_pad_masks"], captured["suffix_att_masks"], + tokenizer, ) assert isinstance(loss, dict) @@ -586,6 +598,7 @@ def capture_embed_suffix_infer(*args, **kwargs): captured_infer["prefix_pad_masks"], captured_infer["suffix_pad_masks"], captured_infer["suffix_att_masks"], + tokenizer, inference_mode=True, )