TensorAuto · shuheng-liu · May 4, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
diff --git a/src/opentau/datasets/__init__.py b/src/opentau/datasets/__init__.py
@@ -22,7 +22,7 @@
 
     - **Core Datasets**: LeRobotDataset for robot learning data with support for
       temporal alignment, multi-modal data, and version compatibility.
-    - **VQA Datasets**: Vision-language datasets (CLEVR, COCO-QA, PIXMO, VSR)
+    - **VQA Datasets**: Vision-language datasets (CLEVR, COCO-QA, VSR)
       for training visual understanding without robot actions.
     - **Dataset Mixtures**: WeightedDatasetMixture for combining multiple datasets
       with controlled sampling proportions.
@@ -53,7 +53,7 @@
 Main Modules:
 
     - **lerobot_dataset**: Core dataset implementation for robot learning data.
-    - **vqa**: Vision-language vqa datasets (CLEVR, COCO-QA, PIXMO, VSR).
+    - **vqa**: Vision-language vqa datasets (CLEVR, COCO-QA, VSR).
     - **dataset_mixture**: Weighted combination of multiple datasets.
     - **factory**: Factory functions for creating datasets from configurations.
     - **utils**: Utility functions for I/O, metadata management, and validation.
@@ -80,5 +80,5 @@
 
         >>> from opentau import available_vqa_datasets
         >>> print(list(available_vqa_datasets.keys()))
-        ['clevr', 'cocoqa', 'dummy', 'pixmo', 'vsr']
+        ['clevr', 'cocoqa', 'dummy', 'vsr']
 """
diff --git a/src/opentau/datasets/factory.py b/src/opentau/datasets/factory.py
@@ -26,7 +26,7 @@
     1. LeRobot datasets: Standard robot learning datasets loaded from HuggingFace
        repositories with configurable delta timestamps for temporal alignment.
     2. VQA datasets: Vision-language vqa datasets (CLEVR, COCO-QA,
-       PIXMO, VSR, etc.) for multimodal learning tasks.
+       VSR, etc.) for multimodal learning tasks.
 
 Key Features:
     - Delta timestamp resolution: Automatically configures temporal offsets
@@ -71,7 +71,6 @@
 import opentau.datasets.vqa.clevr  # noqa: F401
 import opentau.datasets.vqa.cocoqa  # noqa: F401
 import opentau.datasets.vqa.dummy  # noqa: F401
-import opentau.datasets.vqa.pixmo  # noqa: F401
 import opentau.datasets.vqa.vsr  # noqa: F401
 from opentau import available_vqa_datasets
 from opentau.configs.default import DatasetConfig

diff --git a/src/opentau/datasets/grounding/__init__.py b/src/opentau/datasets/grounding/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for encoding spatial outputs as PaliGemma-style location tokens.
+
+PaliGemma reserves 1024 single-token IDs `<loc0000>`..`<loc1023>` that
+quantize a coordinate axis into 1024 bins. Bounding boxes and points are
+emitted as plain strings (`<locYMIN><locXMIN><locYMAX><locXMAX> label`),
+which the standard tokenizer turns into a single integer per `<locNNNN>`.
+
+Two helpers live here:
+
+- ``loc_codec``: pure functions to convert pixel coordinates to/from
+  the loc-token string format. No torch dependency.
+- ``tokenizer_utils.ensure_loc_tokens``: makes the loc strings available
+  on any HuggingFace tokenizer. For PaliGemma the strings already exist,
+  so no IDs are added; they are only registered as single-token matches.
+  For Gemma 3 (and any other base tokenizer) it appends them as special
+  tokens and, when given a model handle, resizes the embedding table.
+
+Concrete grounding datasets (PixMo-points, RefCOCO, …) are NOT yet shipped
+under this package — see the follow-up tracking the configurable response
+formatter that will make them config-driven rather than one class per source.
+"""
diff --git a/src/opentau/datasets/grounding/loc_codec.py b/src/opentau/datasets/grounding/loc_codec.py
@@ -0,0 +1,174 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Codec between pixel coordinates and PaliGemma `<locNNNN>` strings.
+
+PaliGemma's 1024-bin grounding format quantizes a coordinate axis into
+10 bits and emits each bin as a single `<locNNNN>` token (zero-padded to
+four digits — `<loc23>` does not match the tokenizer). A bounding box is
+encoded as four loc tokens in **(y_min, x_min, y_max, x_max)** order
+(y-then-x; do not swap), then a space, then the label, and ` ; ` (space,
+semicolon, space) separates multiple boxes:
+
+    "<loc0234><loc0567><loc0890><loc1023> dog ; <loc0010><loc0050><loc0200><loc0500> cat"
+
+A point is two loc tokens in **(y, x)** order:
+
+    "<loc0234><loc0567> spout"
+
+The 1024 grid is **abstract** — it is not the input image resolution.
+Coordinates are normalized using the original image dimensions, then
+quantized as ``int(round(coord_norm * 1023))`` and clamped to ``[0, 1023]``.
+Pass the original image's `(width, height)` from the dataset (e.g.
+``Image.open(...).size``), NOT the post-resize tensor shape that the
+policy actually consumes.
+
+TODO: eval-side decoding will use ``loc_tokens_to_xyxy`` /
+``loc_tokens_to_points`` against decoded response strings to recover
+bounding boxes for IoU/mAP. Tracked as a follow-up to the configurable
+response-formatter work.
+"""
+
+from __future__ import annotations
+
+import re
+
+NUM_BINS = 1024  # size of the abstract quantization grid (10 bits per axis)
+MAX_BIN = NUM_BINS - 1  # highest valid bin index, i.e. `<loc1023>`
+
+_LOC_TOKEN_RE = re.compile(r"<loc(\d{4})>")  # captures the four-digit bin; the pattern does not range-check it
+
+# Segment separator the encoder emits between adjacent box/point entries
+# (e.g. ``"<...> dog ; <...> cat"``). Decoders split on this so that a
+# malformed segment cannot misalign every subsequent one.
+SEGMENT_SEPARATOR = ";"
+
+
+def _quantize(coord: float, extent: float) -> int:
+    """Quantize one pixel coordinate onto the `[0, 1023]` loc-bin grid.
+
+    The coordinate is normalized by ``extent``, scaled by ``MAX_BIN``,
+    rounded with the built-in ``round`` and clamped into range, so
+    out-of-image coordinates saturate at the nearest edge bin.
+
+    Args:
+        coord: Pixel coordinate (an x or a y) in original-image space.
+        extent: Image size along the same axis (width for x, height for y).
+
+    Returns:
+        The bin index, always within ``[0, MAX_BIN]``; ``0`` when
+        ``extent`` is zero or negative.
+    """
+    if extent <= 0:
+        return 0
+    return min(MAX_BIN, max(0, int(round((coord / extent) * MAX_BIN))))
+
+
+def _dequantize(bin_idx: int, extent: float) -> float:
+    """Map a bin index back to a pixel coordinate: ``(bin_idx / MAX_BIN) * extent``."""
+    return (bin_idx / MAX_BIN) * extent
+
+
+def _loc(bin_idx: int) -> str:  # render one bin index as a zero-padded `<locNNNN>` string
+    return "<loc" + format(bin_idx, "04d") + ">"
+
+
+def xyxy_to_loc_tokens(box_xyxy: tuple[float, float, float, float], img_w: int, img_h: int) -> str:
+    """Render a corner-format pixel box as four concatenated loc tokens.
+
+    Each corner coordinate is quantized against its own axis (x against
+    ``img_w``, y against ``img_h``) and emitted in PaliGemma's y-then-x
+    order: ``<loc Y_min><loc X_min><loc Y_max><loc X_max>``.
+
+    Args:
+        box_xyxy: ``(x_min, y_min, x_max, y_max)`` in pixel coordinates of
+            the **original** image. Note the input is x-first while the
+            output is y-first.
+        img_w: Original image width in pixels.
+        img_h: Original image height in pixels.
+
+    Returns:
+        The four loc tokens joined with no separators and no label, e.g.
+        ``<loc0234><loc0567><loc0890><loc1023>``.
+    """
+    x_min, y_min, x_max, y_max = box_xyxy
+    # (coordinate, axis extent) pairs listed in the emitted y-then-x order.
+    ordered = ((y_min, img_h), (x_min, img_w), (y_max, img_h), (x_max, img_w))
+    return "".join(_loc(_quantize(coord, extent)) for coord, extent in ordered)
+
+
+def xywh_to_loc_tokens(box_xywh: tuple[float, float, float, float], img_w: int, img_h: int) -> str:
+    """Encode a COCO-style ``(x, y, w, h)`` box by converting it to corners first."""
+    left, top, width, height = box_xywh
+    return xyxy_to_loc_tokens((left, top, left + width, top + height), img_w, img_h)
+
+
+def point_to_loc_tokens(x: float, y: float, img_w: int, img_h: int) -> str:
+    """Render a pixel point as two loc tokens, row (y) first and column (x) second."""
+    return "".join((_loc(_quantize(y, img_h)), _loc(_quantize(x, img_w))))
+
+
+def loc_tokens_to_xyxy(s: str, img_w: int, img_h: int) -> list[tuple[float, float, float, float]]:
+    """Parse a string of loc tokens into `(x_min, y_min, x_max, y_max)` pixel boxes.
+
+    Tolerant and segment-aware: the input is split on the encoder's segment
+    separator (``;``), and a segment yields a box only if it holds exactly
+    four loc tokens, all within ``<loc0000>``..``<loc1023>``. Any other
+    segment is dropped silently; its tokens do NOT spill into the next one,
+    so a malformed box cannot misalign later ones. Garbage returns ``[]``.
+
+    Args:
+        s: A string that may contain `<locNNNN>` tokens, e.g. a decoded
+            response. Non-loc text within a segment is ignored.
+        img_w: Original image width in pixels.
+        img_h: Original image height in pixels.
+
+    Returns:
+        A list of `(x_min, y_min, x_max, y_max)` tuples in pixel coordinates.
+    """
+    boxes: list[tuple[float, float, float, float]] = []
+    for segment in s.split(SEGMENT_SEPARATOR):
+        bins = [int(m) for m in _LOC_TOKEN_RE.findall(segment)]
+        # The pattern accepts any four digits; bins above MAX_BIN are not real loc tokens.
+        if len(bins) != 4 or max(bins) > MAX_BIN:
+            continue
+        y_min_b, x_min_b, y_max_b, x_max_b = bins
+        boxes.append(
+            (
+                _dequantize(x_min_b, img_w),
+                _dequantize(y_min_b, img_h),
+                _dequantize(x_max_b, img_w),
+                _dequantize(y_max_b, img_h),
+            )
+        )
+    return boxes
+
+
+def loc_tokens_to_points(s: str, img_w: int, img_h: int) -> list[tuple[float, float]]:
+    """Parse a string of loc tokens into `(x, y)` pixel points.
+
+    Tolerant and segment-aware: the input is split on ``;``, and a segment
+    yields a point only if it holds exactly two loc tokens (in `(y, x)`
+    order per the PaliGemma convention) whose bins are both at most 1023.
+    Any other segment is dropped — its tokens never spill into the next
+    one, so a malformed segment cannot shift later ones.
+    """
+    points: list[tuple[float, float]] = []
+    for segment in s.split(SEGMENT_SEPARATOR):
+        bins = [int(m) for m in _LOC_TOKEN_RE.findall(segment)]
+        if len(bins) != 2 or max(bins) > MAX_BIN:
+            continue
+        y_b, x_b = bins
+        points.append((_dequantize(x_b, img_w), _dequantize(y_b, img_h)))
+    return points
diff --git a/src/opentau/datasets/grounding/tokenizer_utils.py b/src/opentau/datasets/grounding/tokenizer_utils.py
@@ -0,0 +1,121 @@
+# Copyright 2026 Tensor Auto Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenizer-side support for PaliGemma `<loc0000>`..`<loc1023>` tokens.
+
+Two cases must be handled, both via the same call:
+
+1. **PaliGemma (`google/paligemma-3b-pt-224`).** The 1024 loc strings are
+   already in the SentencePiece vocab at IDs ``256000``..``257023``. They
+   are NOT, however, registered as **added tokens**, so the bare HF tokenizer
+   BPE-fragments any `<loc0000>`-shaped string into seven pieces
+   (``['<', 'loc', '0', '0', '0', '0', '>']``) instead of matching it as one
+   unit at ID 256000. Calling ``add_tokens`` with an ``AddedToken`` whose
+   string already exists in the vocab is the documented HF mechanism to
+   *promote* an existing entry to single-token-match status without
+   reassigning its ID. No new vocab slots are created and no embedding
+   resize is needed.
+2. **Gemma 3 (`google/gemma-3-4b-pt`).** The strings are absent. The same
+   ``add_tokens`` call appends 1024 new IDs at the end of the vocab; the
+   model's embedding table and tied LM head must be resized to match. The
+   new rows are random-init — they learn from the grounding data on first
+   use. There is no PaliGemma loc-embedding transfer.
+
+The utility below covers both cases idempotently, so it can be wired into
+every policy ``__init__`` defensively.
+"""
+
+from __future__ import annotations
+
+import logging
+
+import torch
+from transformers.tokenization_utils_base import AddedToken
+
+# The 1024 loc strings, zero-padded to four digits: `<loc0000>`..`<loc1023>`.
+LOC_TOKENS: tuple[str, ...] = tuple(f"<loc{i:04d}>" for i in range(1024))
+
+# Fixed seed used to initialize the new `<locNNNN>` embedding rows on
+# Gemma 3. Hardcoded (not a tunable) so policy construction is bit-stable
+# regardless of when in setup `ensure_loc_tokens` fires. CLAUDE.md hard
+# rule #3 (deterministic seeded reruns of the training loop) depends on
+# this: if the resize consumed the active RNG, two seeded runs could
+# diverge purely from where the helper is called.
+_LOC_EMBEDDING_INIT_SEED: int = 0x10CC0DE
+
+_logger = logging.getLogger(__name__)
+
+
+def ensure_loc_tokens(tokenizer, model=None) -> int:
+    """Register `<loc0000>`..`<loc1023>` as single tokens on ``tokenizer``.
+
+    The 1024 loc strings are passed to ``tokenizer.add_tokens`` as special,
+    non-normalized ``AddedToken`` entries on every call. What happens next
+    depends on how many IDs that call appended:
+
+    - No new IDs (PaliGemma, whose vocab already holds the strings at
+      ``256000``..``257023``, or any repeat call): the function returns
+      immediately without touching the model or logging.
+    - New IDs (first call on a Gemma 3 tokenizer): when ``model`` is given,
+      its embedding table and tied LM head are grown via
+      ``model.resize_token_embeddings``; an info message is logged either way.
+
+    The resize runs inside ``torch.random.fork_rng`` (CPU plus every visible
+    CUDA device) after re-seeding with a fixed constant, so the new rows are
+    initialized identically across runs and the caller's RNG state is
+    restored afterwards. Without this, construction-time embedding init
+    would couple to the active RNG and break CLAUDE.md hard rule #3
+    (deterministic seeded reruns).
+
+    Note: the resize is keyed on the tokenizer growing during *this* call. A
+    model paired with a tokenizer that was already extended earlier is not
+    resized here.
+
+    Args:
+        tokenizer: A HuggingFace `PreTrainedTokenizer` / `PreTrainedTokenizerFast`.
+        model: Optional `PreTrainedModel` to resize when new IDs are assigned.
+            Pass the top-level VLM (e.g. ``Gemma3ForConditionalGeneration`` /
+            ``PaliGemmaForConditionalGeneration``) so the resize reaches the
+            language model's embeddings and the tied LM head.
+
+    Returns:
+        The number of NEW IDs appended to the tokenizer vocab: 1024 on the
+        first call against a fresh Gemma 3 tokenizer, otherwise 0.
+    """
+    vocab_before = len(tokenizer)
+    loc_specials = [AddedToken(tok, special=True, normalized=False) for tok in LOC_TOKENS]
+    tokenizer.add_tokens(loc_specials, special_tokens=True)
+    n_new_ids = len(tokenizer) - vocab_before
+    if n_new_ids <= 0:
+        # Strings were already registered (or pre-existing in the vocab): nothing to resize or log.
+        return n_new_ids
+
+    if model is not None:
+        # Seed inside a forked RNG scope so the random init of the new rows is
+        # reproducible and the caller's CPU/CUDA RNG streams are restored on exit.
+        cuda_devices = list(range(torch.cuda.device_count())) if torch.cuda.is_available() else []
+        with torch.random.fork_rng(devices=cuda_devices, enabled=True):
+            torch.manual_seed(_LOC_EMBEDDING_INIT_SEED)
+            if cuda_devices:
+                torch.cuda.manual_seed_all(_LOC_EMBEDDING_INIT_SEED)
+            model.resize_token_embeddings(len(tokenizer))
+
+    resize_note = "" if model is not None else "NOT "
+    _logger.info(
+        "ensure_loc_tokens: appended %d <locNNNN> token IDs (new vocab size %d); embeddings %sresized.",
+        n_new_ids,
+        len(tokenizer),
+        resize_note,
+    )
+    return n_new_ids
diff --git a/src/opentau/datasets/standard_data_format_mapping.py b/src/opentau/datasets/standard_data_format_mapping.py
@@ -188,13 +188,6 @@
         "prompt": "task",
         "response": "response",
     },
-    "pixmo": {
-        "camera0": "image",
-        "state": "state",
-        "actions": "actions",
-        "prompt": "prompt",
-        "response": "postfix",
-    },
     "dummy": {
         "camera0": "image",
         "state": "state",