fix(pi07): emit obs_history_is_pad at inference; address review nits

shuheng-liu · shuheng-liu · commit 57716e4872d7 · 2026-05-04T12:02:46.000-07:00
Follow-up to the review on PR #253. Main change: emit obs_history_is_pad from _build_history_batch. Previously the inference path never threaded the mask through select_action -> sample_actions -> embed_prefix, so the encoder fell back to "all history padded except current" via embed_prefix's zeros + state_mask[:, -1] = True branch and the encoder's matching None-fallback. That fallback fixed the start-of-episode contamination but silently masked genuine mid-episode history once the buffer filled. _build_history_batch already owns the idx < 0 decision for each slot; emitting obs_history_is_pad = (i*interval - missing < 0) for i in range(n_hist) costs nothing and lets the encoder use real mid-episode history while still masking only the truly-padded start-of-episode slots. The mask is threaded through PI07LowLevelPolicy.sample_actions and PI07LowLevelFlowMatching.sample_actions to reach embed_prefix with the live mask. The encoder's None-fallback and embed_prefix's zeros-then-current branch are both kept as defensive defaults: anything that bypasses _build_history_batch (direct sample_actions calls in tests, etc.) still gets a safe default that matches training-time augmentations. Nits from review: - Removed redundant .clone() in embed_prefix's state_mask path and in _build_temporal_attn_mask's key_valid path. Both ~obs_history_is_pad expressions already allocate fresh tensors, so subsequent [:, -1] = True writes can't reach the caller's obs_history_is_pad. Kept the regression tests that pinned the no-mutation property. - Rewrote the SpaceTimeEncoderLayerWrapper.forward docstring to drop the misleading "ignored by vanilla layers via SiglipEncoder's positional dispatch" claim — the encoder now branches on isinstance and never calls SiglipEncoder.forward, so vanilla layers simply aren't passed temporal_attn_mask. Skipped the repeat_interleave perf nit: the suggested expand+reshape materializes anyway because SDPA flattens (B, N) into one dim. 
The real fix is a deeper refactor of the temporal Q/K/V layout; deferred. New tests in TestBuildHistoryBatchEmitsObsHistoryIsPad (test_pi07_cpu.py) pin the (B, T) mask shape, the first-step all-but-current pattern, the buffer-full all-False pattern, the partial-fill mid-episode pattern, batch broadcasting, the n_obs_history=1 all-False case, and slot-by-slot agreement with the actual zero-fill of state and camera tensors.
diff --git a/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py b/src/opentau/policies/pi07/low_level/modeling_pi07_low_level.py
@@ -430,17 +430,22 @@ def _build_history_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
 
         Appends the single-frame observation from ``batch`` to internal deque
         buffers, then assembles a batch with ``n_obs_history`` evenly-spaced
-        frames (interval = ``history_interval``).  Early in an episode, missing
-        history slots are zero-padded.
+        frames (interval = ``history_interval``). Early in an episode the
+        buffer is partially filled, so some slots are zero-padded; the
+        returned ``"obs_history_is_pad"`` (B, T) bool tensor flags those
+        slots ``True`` so the model can mask them out of attention. Once the
+        buffer is full (typically a handful of steps in), the mask is all
+        ``False`` and the encoder uses the real history.
 
         Expected batch keys:
             - ``"state"``: (B, D) current proprioceptive state.
             - image keys matching ``config.image_features``: (B, C, H, W) camera frames.
             - ``"prompt"``: list[str] language instructions (passed through unchanged).
             - Any other metadata keys are forwarded unchanged.
 
-        Returns a new dict with ``"state"`` expanded to (B, T, D) and image keys
-        expanded to (B, T, C, H, W), where T = ``n_obs_history``.
+        Returns a new dict with ``"state"`` expanded to (B, T, D), image keys
+        expanded to (B, T, C, H, W), and a new ``"obs_history_is_pad"`` (B, T)
+        bool tensor (``True`` = padded). T = ``n_obs_history``.
         """
         assert self.config.n_obs_history is not None
         n_hist: int = self.config.n_obs_history
@@ -465,6 +470,8 @@ def _build_history_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
         # sample n_hist frames at the configured interval
         buf_len = len(self._state_buffer)
         missing = buf_maxlen - buf_len  # how many slots are still empty
+        bsize = batch["state"].shape[0]
+        device = batch["state"].device
 
         # Pass through all non-image, non-state keys (e.g. "prompt" and other metadata).
         temporal_batch = {key: v for key, v in batch.items() if key not in img_keys and key != "state"}
@@ -490,6 +497,21 @@ def _build_history_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]:
                     cam_frames.append(self._obs_buffers[key][idx])
             temporal_batch[key] = torch.stack(cam_frames, dim=1)  # (B, T, C, H, W)
 
+        # Same `idx < 0` decision as the loops above: a slot is padded iff the
+        # buffer didn't have an entry to fill it. The pattern is identical
+        # for state and every camera (they share the same buffer length), so
+        # we emit one (B, T) mask. Broadcast across batch — every sample sees
+        # the same padding pattern at any given step. Without this, the
+        # encoder's None-fallback masks ALL history at inference (including
+        # genuine mid-episode frames once the buffer is full); with it, only
+        # the actually-padded start-of-episode slots get masked.
+        pad_pattern = torch.tensor(
+            [i * interval - missing < 0 for i in range(n_hist)],
+            dtype=torch.bool,
+            device=device,
+        )
+        temporal_batch["obs_history_is_pad"] = pad_pattern.unsqueeze(0).expand(bsize, n_hist)
+
         return temporal_batch
 
     @torch.no_grad()
@@ -564,7 +586,12 @@ def sample_actions(
 
         batch = self.normalize_inputs(batch)
 
-        videos, vid_masks = self.prepare_videos(batch)
+        # `_build_history_batch` (called from `select_action` upstream) emits
+        # this; it's None when the caller skipped that step (e.g. n_obs_history
+        # is None/1, or sample_actions is invoked directly without the buffer).
+        obs_history_is_pad = batch.get("obs_history_is_pad")
+
+        videos, vid_masks = self.prepare_videos(batch, obs_history_is_pad=obs_history_is_pad)
         lang_tokens, lang_masks = self.prepare_language(batch)
         response_tokens, response_masks = self.prepare_response(batch)
         state = self.prepare_state(batch)
@@ -616,6 +643,7 @@ def sample_actions(
             metadata_masks=metadata_masks,
             response_tokens=response_tokens,
             response_masks=response_masks,
+            obs_history_is_pad=obs_history_is_pad,
         )
 
         action_feature = self.config.action_feature
@@ -1299,7 +1327,7 @@ def embed_prefix(
         state_emb = self.state_proj(state.to(dtype=_preferred_dtype()))
         num_state_tokens = state_emb.shape[1]  # T
         if obs_history_is_pad is not None:
-            state_mask = ~obs_history_is_pad  # (B, T)
+            state_mask = ~obs_history_is_pad  # (B, T) — `~` allocates a fresh tensor
         else:
             # Absent → assume all history is padded; only current step is real.
             state_mask = torch.zeros(bsize, num_state_tokens, dtype=torch.bool, device=state.device)
@@ -1308,7 +1336,9 @@ def embed_prefix(
         # all-True. Without this override the policy would condition on no
         # state at all, since attention to the current state token would be
         # masked out — defeating the purpose of preserving the current frame.
-        state_mask = state_mask.clone()  # avoid in-place mutation of obs_history_is_pad
+        # Both branches above produce fresh tensors (`~` allocates;
+        # `torch.zeros` allocates), so the `[:, -1] = True` write below does
+        # not reach the caller's `obs_history_is_pad`.
         state_mask[:, -1] = True
 
         embs.append(state_emb)
@@ -1754,6 +1784,7 @@ def sample_actions(
         metadata_masks: Tensor | None = None,
         response_tokens: Tensor | None = None,
         response_masks: Tensor | None = None,
+        obs_history_is_pad: Tensor | None = None,
     ) -> Tensor:
         """Inference: iteratively denoise to produce a continuous action chunk.
 
@@ -1778,6 +1809,13 @@ def sample_actions(
             metadata_masks: Optional mask for metadata tokens.
             response_tokens: Optional subtask response token IDs.
             response_masks: Optional mask for response tokens.
+            obs_history_is_pad: Optional ``(B, T)`` bool mask flagging padded
+                history slots (``True`` = padded). Emitted by
+                ``PI07LowLevelPolicy._build_history_batch`` so the encoder can
+                use real mid-episode history while still masking out the
+                start-of-episode zero-fill. ``None`` falls back to "all
+                history padded except current" via ``embed_prefix`` and the
+                encoder's None-fallback.
 
         Returns:
             Denoised action chunk ``(B, chunk_size, max_action_dim)``.
@@ -1801,6 +1839,7 @@ def sample_actions(
             metadata_masks,
             subgoal_images=subgoal_images,
             subgoal_img_masks=subgoal_img_masks,
+            obs_history_is_pad=obs_history_is_pad,
         )
         prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
         prefix_position_ids = torch.cumsum(prefix_pad_masks, dim=1) - 1
diff --git a/src/opentau/policies/pi07/low_level/video_encoder.py b/src/opentau/policies/pi07/low_level/video_encoder.py
@@ -291,8 +291,10 @@ def forward(
         """hidden_states: (B*T, N, D) -> tuple starting with (B*T, N, D).
 
         Signature extends ``SiglipEncoderLayer.forward`` with an extra
-        ``temporal_attn_mask`` kwarg (ignored by vanilla layers via
-        ``SiglipEncoder``'s positional dispatch).
+        ``temporal_attn_mask`` kwarg. Vanilla ``SiglipEncoderLayer`` instances
+        never receive it: ``SpaceTimeSiglipVideoEncoder.forward`` dispatches
+        it only to ``SpaceTimeEncoderLayerWrapper`` instances via an
+        ``isinstance`` check, bypassing ``SiglipEncoder.forward`` entirely.
 
         Args:
             temporal_attn_mask: Optional ``(B*N, 1, T, T)`` additive float mask
@@ -506,8 +508,9 @@ def _build_temporal_attn_mask(
         # callers (e.g. the dataset's history_state_drop_prob augmentation)
         # set obs_history_is_pad to all-True; without this override, the
         # current frame would have no key to attend to and produce NaNs.
+        # `~obs_history_is_pad` allocates a fresh tensor, so the in-place
+        # write below does not reach the caller's `obs_history_is_pad`.
         key_valid = ~obs_history_is_pad  # (B, T)
-        key_valid = key_valid.clone()  # avoid in-place mutation of caller's tensor
         key_valid[:, -1] = True
 
         # Combined: (B, T_query, T_key)
diff --git a/tests/policies/test_pi07_cpu.py b/tests/policies/test_pi07_cpu.py
@@ -1529,3 +1529,134 @@ def _build(with_actions: bool):
         torch.testing.assert_close(embs_train[:, :infer_len], embs_infer)
         torch.testing.assert_close(pad_train[:, :infer_len], pad_infer)
         torch.testing.assert_close(att_train[:, :infer_len], att_infer)
+
+
+# `_build_history_batch` emits ``obs_history_is_pad`` so the encoder can use
+# real mid-episode history while still masking start-of-episode zero-fill.
+# Without this emit, the encoder's None-fallback masks ALL history at
+# inference (mid-episode regression flagged in the PR #253 review).
+
+
+class TestBuildHistoryBatchEmitsObsHistoryIsPad:
+    @staticmethod
+    def _make_policy_stub(*, n_obs_history: int, history_interval: int, image_keys: list[str]):
+        """Construct a partial PI07LowLevelPolicy that exposes only the
+        attrs ``_build_history_batch`` reads: ``config.{n_obs_history,
+        history_interval, obs_buffer_size, image_features}`` plus the deque
+        slots. Skips Gemma 3 init so the test stays CPU-cheap.
+        """
+        import types
+
+        from opentau.policies.pi07.low_level.modeling_pi07_low_level import (
+            PI07LowLevelPolicy,
+        )
+
+        policy = PI07LowLevelPolicy.__new__(PI07LowLevelPolicy)
+        buf_size = (n_obs_history - 1) * history_interval + 1
+        policy.config = types.SimpleNamespace(
+            n_obs_history=n_obs_history,
+            history_interval=history_interval,
+            obs_buffer_size=buf_size,
+            image_features=dict.fromkeys(image_keys),
+        )
+        policy._state_buffer = None
+        policy._obs_buffers = None
+        return policy
+
+    def _make_batch(self, image_keys: list[str], state_dim: int = 4) -> dict:
+        return {
+            "state": torch.zeros(1, state_dim),
+            **{k: torch.zeros(1, 3, 8, 8) for k in image_keys},
+        }
+
+    def test_first_step_marks_all_but_current_padded(self):
+        """At episode start, only the very first observation is in the
+        buffer; every other slot in the requested history was zero-filled.
+        Mask should be ``[True, ..., True, False]`` — the canonical case
+        the PR's Bug A fix protects against contamination from.
+        """
+        policy = self._make_policy_stub(n_obs_history=4, history_interval=1, image_keys=["camera0"])
+        out = policy._build_history_batch(self._make_batch(["camera0"]))
+
+        assert "obs_history_is_pad" in out
+        assert out["obs_history_is_pad"].shape == (1, 4)
+        assert out["obs_history_is_pad"].dtype == torch.bool
+        assert out["obs_history_is_pad"].tolist() == [[True, True, True, False]]
+
+    def test_buffer_full_emits_all_false(self):
+        """Once the buffer is full (after ``obs_buffer_size`` calls), every
+        slot maps to a real observation — mask is all-False. This is the
+        mid-episode case the previous PR regressed: with the None-fallback,
+        the encoder masked these real frames out as if they were padded.
+        """
+        policy = self._make_policy_stub(n_obs_history=4, history_interval=2, image_keys=["camera0"])
+        # obs_buffer_size = (4-1)*2 + 1 = 7. Need 7 calls to fill.
+        batch = self._make_batch(["camera0"])
+        for _ in range(7):
+            out = policy._build_history_batch(batch)
+        assert out["obs_history_is_pad"].tolist() == [[False, False, False, False]]
+
+    def test_partial_fill_marks_only_unfilled_slots(self):
+        """After ``k < obs_buffer_size`` calls, the leading ``T -
+        ceil(k / interval)`` slots are still virtual past-steps. With
+        ``n_obs_history=4, history_interval=2`` (buffer_size=7), after 4
+        calls the deque has 4 entries → ``missing = 3`` → slots with
+        ``i*interval - 3 < 0`` are padded: i=0 → -3 (T), i=1 → -1 (T),
+        i=2 → 1 (F), i=3 → 3 (F). So mask = [T, T, F, F].
+        """
+        policy = self._make_policy_stub(n_obs_history=4, history_interval=2, image_keys=["camera0"])
+        batch = self._make_batch(["camera0"])
+        for _ in range(4):
+            out = policy._build_history_batch(batch)
+        assert out["obs_history_is_pad"].tolist() == [[True, True, False, False]]
+
+    def test_mask_is_broadcast_over_batch(self):
+        """The buffer is shared across batch elements (every sample sees
+        the same buffer length at any given step), so the (B, T) mask is
+        the same across the batch dim. Verify by emitting from a B=3 batch.
+        """
+        policy = self._make_policy_stub(n_obs_history=4, history_interval=1, image_keys=["camera0"])
+        batch = {
+            "state": torch.zeros(3, 4),
+            "camera0": torch.zeros(3, 3, 8, 8),
+        }
+        out = policy._build_history_batch(batch)
+
+        assert out["obs_history_is_pad"].shape == (3, 4)
+        # Every batch element sees the same mask.
+        assert torch.all(out["obs_history_is_pad"] == out["obs_history_is_pad"][0:1])
+
+    def test_n_obs_history_one_emits_all_false(self):
+        """With ``n_obs_history=1`` the buffer always contains the current
+        frame — no historical slots exist, so the (B, 1) mask is False
+        from step 1. (In practice ``select_action`` skips
+        ``_build_history_batch`` entirely when ``n_obs_history <= 1``, so
+        this is just defending the function's own contract.)
+        """
+        policy = self._make_policy_stub(n_obs_history=1, history_interval=1, image_keys=["camera0"])
+        out = policy._build_history_batch(self._make_batch(["camera0"]))
+        assert out["obs_history_is_pad"].tolist() == [[False]]
+
+    def test_state_and_camera_padding_match_emitted_mask(self):
+        """The emitted mask must agree slot-for-slot with the actual
+        zero-padding pattern of state and camera tensors. State / camera
+        are zeroed where ``idx < 0``; the mask flags the same slots ``True``.
+        """
+        policy = self._make_policy_stub(n_obs_history=3, history_interval=1, image_keys=["camera0"])
+        # Inject a non-zero observation so we can detect zero-fill.
+        batch = {
+            "state": torch.full((1, 4), 7.0),
+            "camera0": torch.full((1, 3, 8, 8), 5.0),
+        }
+        out = policy._build_history_batch(batch)
+        # After one call: missing = 2; mask = [True, True, False].
+        is_pad = out["obs_history_is_pad"][0]  # (T,)
+        state = out["state"][0]  # (T, D)
+        cam = out["camera0"][0]  # (T, C, H, W)
+        for t, padded in enumerate(is_pad.tolist()):
+            if padded:
+                assert torch.all(state[t] == 0.0), f"state[{t}] not zero-filled"
+                assert torch.all(cam[t] == 0.0), f"camera[{t}] not zero-filled"
+            else:
+                assert torch.all(state[t] == 7.0), f"state[{t}] zero-filled but mask says real"
+                assert torch.all(cam[t] == 5.0), f"camera[{t}] zero-filled but mask says real"