From cefcbe2c2ddcd6fb3daa8589e405889e44508e1c Mon Sep 17 00:00:00 2001
From: Richard Abrich <richard.abrich@gmail.com>
Date: Sat, 28 Mar 2026 16:05:20 -0400
Subject: [PATCH] fix: use list input format for Outlines multimodal generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TransformersMultiModal.format_input is a singledispatch that only
accepts `list` and `Chat` types. A `dict` raises TypeError.

Correct format: [prompt_text, outlines.Image(pil_image)]
Wrong format:   {"text": prompt, "images": [image]}

Also fixes PIL .format being dropped by .convert("RGB") — outlines.Image
requires .format to be set. Restored after conversion.

New test: test_outlines_multimodal_input_format verifies:
- list is a registered dispatch type (dict is NOT)
- outlines.Image wraps PIL images correctly
- This test would have caught both the dict and format bugs

36/36 tests pass in 0.10s.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../training/standalone/trainer.py            | 17 +++++---
 tests/test_standalone_trainer.py              | 43 +++++++++++++++++++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/openadapt_evals/training/standalone/trainer.py b/openadapt_evals/training/standalone/trainer.py
index f5d8e5f..256560b 100644
--- a/openadapt_evals/training/standalone/trainer.py
+++ b/openadapt_evals/training/standalone/trainer.py
@@ -200,7 +200,11 @@ def _collect_rollout(self, task_id: str, instruction: str) -> Rollout:
                 logger.info("Stuck at step %d", step_idx)
                 break
 
-            image = Image.open(io.BytesIO(screenshot)).convert("RGB")
+            image = Image.open(io.BytesIO(screenshot))
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+                # .convert() drops .format; restore it for outlines.Image
+                image.format = "PNG"
             messages = build_agent_messages(instruction, include_image=True)
             if hasattr(self._processor, "apply_chat_template"):
                 text_input = self._processor.apply_chat_template(
@@ -215,10 +219,13 @@ def _collect_rollout(self, task_id: str, instruction: str) -> Rollout:
                 else None
             )
             if outlines_gen is not None:
-                # Outlines v1.2 Generator API: handles tokenization,
-                # generation, and decoding internally.  For multimodal
-                # models, pass a dict with "text" + image keys.
-                model_input = {"text": text_input, "images": [image]}
+                # Outlines v1.2 Generator API for multimodal models.
+                # TransformersMultiModal.format_input dispatches on type:
+                #   list  → [prompt_text, Image(pil), ...]
+                #   Chat  → Chat([Message(...)])
+                # A dict is NOT accepted (raises TypeError).
+                import outlines
+                model_input = [text_input, outlines.Image(image)]
                 decoded = outlines_gen(
                     model_input,
                     max_new_tokens=self._config.max_new_tokens,
diff --git a/tests/test_standalone_trainer.py b/tests/test_standalone_trainer.py
index 9024485..2e158d2 100644
--- a/tests/test_standalone_trainer.py
+++ b/tests/test_standalone_trainer.py
@@ -203,6 +203,49 @@ def test_outlines_generator_api_contract(self) -> None:
             for p in params_call
         ), f"SteerableGenerator.__call__ doesn't accept **kwargs: {sig_call}"
 
+    def test_outlines_multimodal_input_format(self) -> None:
+        """Verify outlines TransformersMultiModal accepts list input, not dict.
+
+        This is THE test that catches the input format bug. The trainer
+        must pass [prompt, outlines.Image(pil)] not {"text": ..., "images": ...}.
+
+        TransformersMultiModalTypeAdapter.format_input is a singledispatch
+        that only accepts `list` and `Chat` types. A `dict` raises TypeError.
+        """
+        try:
+            import outlines
+            from outlines.models.transformers import TransformersMultiModalTypeAdapter
+        except ImportError:
+            pytest.skip("outlines not installed")
+
+        # Verify list is a registered dispatch type by checking the
+        # class-level dispatcher registry (singledispatchmethod stores
+        # it on the descriptor, not the bound method).
+        fmt = TransformersMultiModalTypeAdapter.__dict__["format_input"]
+        registry = fmt.dispatcher.registry
+        registered_types = set(registry.keys())
+        assert list in registered_types, (
+            f"list not registered in format_input dispatch: {registered_types}. "
+            f"The trainer passes [prompt, Image(pil)] — this type must be accepted."
+        )
+        assert dict not in registered_types, (
+            "dict is registered in format_input — if this changes, the trainer's "
+            "input format can be simplified back to a dict."
+        )
+
+        # Verify outlines.Image exists and wraps PIL images
+        assert hasattr(outlines, "Image"), "outlines.Image not found"
+        from PIL import Image as PILImage
+        import io
+        test_img = PILImage.new("RGB", (10, 10))
+        # outlines.Image requires .format to be set (loaded from file)
+        buf = io.BytesIO()
+        test_img.save(buf, format="PNG")
+        buf.seek(0)
+        test_img_with_format = PILImage.open(buf)
+        wrapped = outlines.Image(test_img_with_format)
+        assert wrapped is not None
+
     def test_false_sentinel_not_confused_with_none(self) -> None:
         """Regression: False sentinel must return None, not be treated as uninitialized."""
         config = TrainingConfig(constrained_decoding=True)