SoundMindsAI · SoundMindsAI · May 10, 2026 · May 10, 2026 · May 10, 2026 · May 10, 2026
@@ -41,6 +41,18 @@ WORKDIR /app
 # ---------------------------------------------------------------------------
 FROM base AS deps
 
+# pytrec_eval (added by infra_optuna_eval) ships as a sdist that compiles a
+# C extension on install — it has no Python 3.12 wheel. We install gcc +
+# python-dev headers here, then this whole stage is discarded (the runtime
+# stage copies only /app/.venv, not the build toolchain), so the final image
+# stays slim.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        gcc \
+        g++ \
+        python3-dev \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy lockfile + project metadata first so dependency-only layer caches well.
 COPY pyproject.toml uv.lock README.md ./
 

@@ -106,10 +106,20 @@ backend/
     adapters/    engine adapters — protocol.py (SearchAdapter Protocol +
                  8 Pydantic types), elastic.py (ES + OpenSearch),
                  credentials.py, errors.py, health_cache.py
+    eval/        pytrec_eval scoring + Optuna runtime helpers (from
+                 infra_optuna_eval): types.py (SamplerKind/PrunerKind/
+                 TrialStatus Literals), scoring.py (score, frozensets,
+                 objective_metric_key, wire-name translation),
+                 optuna_runtime.py (build_storage / build_sampler /
+                 build_pruner / get_or_create_study),
+                 qrels_loader.py (MVP1 stub raising JudgmentsTableMissing
+                 — real impl lands with feat_llm_judgments)
     scripts/     operator entrypoints — seed_clusters.py
     llm/         OpenAI-compatible client + capability check
     git/         Git provider clients (lands with feat_github_pr_worker)
-  workers/       Arq WorkerSettings (functions=[] in MVP1)
+  workers/       Arq WorkerSettings + run_trial Arq job (trials.py from
+                 infra_optuna_eval) + on_startup/on_shutdown hooks that
+                 build/dispose Optuna RDBStorage once per worker
   tests/         unit / integration / contract layers
 ui/              Next.js 14 App Router (placeholder page in MVP1)
 migrations/      Alembic config + versions/ (0001 baseline + 0002 clusters

@@ -43,9 +43,15 @@ class Trial(Base):
         String(36), ForeignKey("studies.id", ondelete="CASCADE"), nullable=False
     )
     optuna_trial_number: Mapped[int] = mapped_column(Integer, nullable=False)
-    """Optuna's per-study trial number; idempotent across worker restarts
-    (re-running a trial_number doesn't create a duplicate row because
-    Optuna's ``study.ask()`` is idempotent on the trial number)."""
+    """Optuna's per-study trial number. Pre-assigned by ``feat_study_lifecycle``
+    Phase 2's orchestrator via ``study.ask().number`` (which also calls
+    ``trial.suggest_*`` to populate params) before enqueueing
+    ``run_trial(study_id, optuna_trial_number)``. The ``infra_optuna_eval``
+    worker does NOT call ``study.ask()`` — it loads the pre-assigned
+    in-flight trial via ``study.trials[optuna_trial_number]``. Idempotency
+    on ``(study_id, optuna_trial_number)`` is enforced by the worker
+    (app-row check + Optuna-side reconciliation) per
+    ``infra_optuna_eval/feature_spec.md`` §11."""
     params: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False)
     """The parameter combination Optuna's sampler picked for this trial."""
     primary_metric: Mapped[float | None] = mapped_column(Float, nullable=True)

@@ -9,9 +9,10 @@
 exists. Optuna's ``create_study(storage=...)`` then creates its tables on first
 use (no-op on subsequent runs).
 
-In MVP1 this is effectively a no-op stub since ``infra_optuna_eval`` hasn't
-shipped yet. Becomes load-bearing the moment the trial worker calls
-``optuna.create_study(storage="postgresql://.../optuna")`` for the first time.
+In MVP1 this prepares the schema namespace; ``infra_optuna_eval``'s worker
+boot triggers Optuna's lazy table creation on first ``RDBStorage`` use.
+``WorkerSettings.on_startup`` constructs the ``RDBStorage`` once per worker
+(spec FR-1); the schema must already exist at that point.
 """
 
 import logging

@@ -0,0 +1,184 @@
+"""Optuna study factory + sampler/pruner builders (infra_optuna_eval Story 2.1).
+
+Pure-Python wrappers around Optuna's ``optuna.create_study``,
+``RDBStorage``, ``TPESampler`` / ``RandomSampler``, and ``MedianPruner`` /
+``NopPruner``. Encapsulates spec §FR-1 (RDB schema isolation via
+``options=-csearch_path=optuna``) and spec §FR-2 (sampler / pruner defaults,
+key-presence-vs-absence semantics, explicit-override).
+
+URL composition is factored into the pure ``_compose_storage_url()`` helper
+so unit tests can verify it without constructing a real ``RDBStorage``
+(which may open a DB connection depending on the installed Optuna version
+— see spec FR-1/AC-1b for the "neither timing is guaranteed" clause).
+
+Optuna's ``RDBStorage`` is **synchronous**; callers from async contexts
+(the worker, integration tests) wrap usage in ``asyncio.to_thread()`` per
+the project Conventions in the implementation plan.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from urllib.parse import urlparse, urlunparse
+
+import optuna
+from optuna.pruners import BasePruner, MedianPruner, NopPruner
+from optuna.samplers import BaseSampler, RandomSampler, TPESampler
+
+# ---------------------------------------------------------------------------
+# Storage URL composition (pure)
+# ---------------------------------------------------------------------------
+
+_OPTUNA_SEARCH_PATH_OPTION = "options=-csearch_path=optuna"
+"""Postgres connection option that pins all CREATE/SELECT to the ``optuna`` schema."""
+
+
+def _compose_storage_url(database_url: str) -> str:
+    """Build the URL Optuna's ``RDBStorage`` should connect with.
+
+    Steps:
+
+    1. Strip the ``+asyncpg`` driver prefix (Optuna uses a sync engine).
+       Mirrors the conversion in ``backend/app/db/optuna_schema.py:41``.
+    2. Append ``options=-csearch_path=optuna`` to the query string so
+       all Optuna DDL/DML lands in the ``optuna.*`` namespace (per spec
+       FR-1 + the operational invariant from
+       ``docs/01_architecture/optimization.md``).
+
+    Idempotent: if the option already appears in the URL, the URL is
+    returned unchanged.
+    """
+    sync_url = database_url.replace("postgresql+asyncpg://", "postgresql://")
+    parsed = urlparse(sync_url)
+    existing_query = parsed.query
+
+    if _OPTUNA_SEARCH_PATH_OPTION in existing_query:
+        return sync_url
+
+    new_query = (
+        f"{existing_query}&{_OPTUNA_SEARCH_PATH_OPTION}"
+        if existing_query
+        else _OPTUNA_SEARCH_PATH_OPTION
+    )
+    return urlunparse(
+        (
+            parsed.scheme,
+            parsed.netloc,
+            parsed.path,
+            parsed.params,
+            new_query,
+            parsed.fragment,
+        )
+    )
+
+
+def build_storage(database_url: str) -> optuna.storages.RDBStorage:
+    """Construct an ``RDBStorage`` against the same Postgres as the app DB.
+
+    Whether construction opens a DB connection or defers it to first use
+    is an Optuna implementation detail — spec FR-1/AC-1b explicitly does
+    not constrain the trigger. Callers in async contexts MUST wrap the
+    call in ``asyncio.to_thread()``.
+    """
+    return optuna.storages.RDBStorage(url=_compose_storage_url(database_url))
+
+
+# ---------------------------------------------------------------------------
+# Sampler / pruner builders (spec §FR-2 contract)
+# ---------------------------------------------------------------------------
+
+
+def build_sampler(config: dict[str, Any], *, seed: int | None) -> BaseSampler:
+    """Build the Optuna sampler from ``studies.config``.
+
+    Spec §FR-2:
+
+    * ``"sampler"`` key absent → ``TPESampler(seed=seed)`` (MVP1 default).
+    * ``config["sampler"] == "tpe"`` → ``TPESampler(seed=seed)``.
+    * ``config["sampler"] == "random"`` → ``RandomSampler(seed=seed)``
+      (baseline-comparison option per spec §3).
+
+    Raises:
+        ValueError: on any other value (CMA-ES, hyperband, etc. are reserved
+        for MVP2 per spec §3 Out of scope).
+    """
+    sampler = config.get("sampler", "tpe")
+    if sampler == "tpe":
+        return TPESampler(seed=seed)
+    if sampler == "random":
+        return RandomSampler(seed=seed)
+    raise ValueError(
+        f"unsupported sampler {sampler!r}; MVP1 allows: ['tpe', 'random'] "
+        f"(CMA-ES reserved for MVP2 per spec §3)"
+    )
+
+
+def build_pruner(config: dict[str, Any]) -> BasePruner:
+    """Build the Optuna pruner from ``studies.config``.
+
+    Spec §FR-2 two-pronged contract:
+
+    * ``"pruner"`` key **absent** AND ``config["max_trials"] < 50`` →
+      ``NopPruner`` (safeguard — small studies don't get enough TPE warmup).
+    * ``"pruner"`` key **absent** AND ``config["max_trials"] >= 50`` →
+      ``MedianPruner(n_warmup_steps=10)`` (MVP1 default).
+    * ``config["pruner"] == "median"`` **explicit** → ``MedianPruner(n_warmup_steps=10)``
+      regardless of ``max_trials`` (operator-override per spec FR-2 AC-6b).
+    * ``config["pruner"] == "none"`` → ``NopPruner``.
+
+    The data-contract distinction between "default-omitted" and "explicit-median"
+    is the key-presence signal in ``config``. Phase 2's API layer is required NOT
+    to materialize defaults into the stored row (per spec FR-2 last paragraph).
+
+    Raises:
+        ValueError: on any other ``pruner`` value, or if ``max_trials`` is
+        missing AND ``pruner`` is unspecified (we need ``max_trials`` to make
+        the safeguard decision).
+    """
+    if "pruner" in config:
+        pruner = config["pruner"]
+        if pruner == "median":
+            return MedianPruner(n_warmup_steps=10)
+        if pruner == "none":
+            return NopPruner()
+        raise ValueError(f"unsupported pruner {pruner!r}; MVP1 allows: ['median', 'none']")
+
+    # Default-omitted: depends on max_trials.
+    max_trials = config.get("max_trials")
+    if not isinstance(max_trials, int):
+        raise ValueError(
+            "config.max_trials is required when pruner is unspecified "
+            "(needed to apply the FR-2 small-study auto-disable safeguard); "
+            f"got {type(max_trials).__name__}"
+        )
+    if max_trials < 50:
+        return NopPruner()
+    return MedianPruner(n_warmup_steps=10)
+
+
+# ---------------------------------------------------------------------------
+# Study factory
+# ---------------------------------------------------------------------------
+
+
+def get_or_create_study(
+    *,
+    storage: optuna.storages.RDBStorage,
+    optuna_study_name: str,
+    direction: str,
+    sampler: BaseSampler,
+    pruner: BasePruner,
+) -> optuna.Study:
+    """Load the Optuna study by name, or create it.
+
+    Thin wrapper over ``optuna.create_study(load_if_exists=True, ...)``.
+    Synchronous — wrap callers in ``asyncio.to_thread()`` from async code.
+    """
+    return optuna.create_study(
+        storage=storage,
+        study_name=optuna_study_name,
+        direction=direction,
+        sampler=sampler,
+        pruner=pruner,
+        load_if_exists=True,
+    )
@@ -0,0 +1,81 @@
+"""Qrels loader interface (infra_optuna_eval Story 2.2).
+
+Single import point for the ``run_trial`` worker to fetch judgment ratings
+(qrels) for scoring. In MVP1 this is a typed stub that raises
+``JudgmentsTableMissing`` because the ``judgments`` child table is owned by
+``feat_llm_judgments`` (per ``docs/01_architecture/data-model.md`` §"judgment_lists
+and judgments") and has not shipped yet.
+
+Why a stub, not a real ``SELECT``:
+
+* Spec §9 explicitly forbids new tables in this feature.
+* Spec §3 In/Out of scope: judgment generation is owned by
+  ``feat_llm_judgments``; this feature does NOT generate or persist
+  judgments.
+* The only production callers of ``run_trial`` are
+  ``feat_study_lifecycle`` Phase 2's orchestrator and (indirectly)
+  ``feat_llm_judgments``'s runner — both deferred. There is no MVP1
+  dispatch path that would hit this stub in production. Premature dispatch
+  (e.g., an operator manually enqueueing ``run_trial`` against ``arq``)
+  fails loud with a clear typed exception rather than a confusing
+  ``UndefinedTable`` SQL error.
+
+When ``feat_llm_judgments`` lands, that feature's plan replaces this
+stub with a real ``SELECT`` against the ``judgments`` table::
+
+    SELECT query_id, doc_id, rating
+    FROM judgments
+    WHERE judgment_list_id = :judgment_list_id
+
+grouped by ``query_id``. Integration tests for THIS feature monkeypatch
+``load_qrels`` to inject hand-built qrels (per spec AC-4).
+"""
+
+from __future__ import annotations
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from backend.app.eval.scoring import Qrels
+
+
+class JudgmentsTableMissing(RuntimeError):
+    """Raised in MVP1 when ``run_trial`` attempts to load qrels.
+
+    The ``judgments`` table is owned by ``feat_llm_judgments`` (per
+    ``docs/01_architecture/data-model.md`` §"judgment_lists and judgments")
+    and has not shipped yet. When ``feat_llm_judgments`` lands, ``load_qrels``
+    is replaced with a real ``SELECT`` and this exception class becomes
+    unreachable.
+
+    Integration tests in ``backend/tests/integration/test_run_trial*.py``
+    monkeypatch ``load_qrels`` to return hand-built qrels (per spec AC-4
+    "hand-built judgment list"), so the stub does not block test coverage
+    of the ``run_trial`` runtime contract.
+    """
+
+
+async def load_qrels(db: AsyncSession, judgment_list_id: str) -> Qrels:
+    """Load qrels for a judgment list.
+
+    MVP1: always raises ``JudgmentsTableMissing``. See module docstring for
+    the rationale. When ``feat_llm_judgments`` lands, the implementation is:
+
+        stmt = select(Judgment.query_id, Judgment.doc_id, Judgment.rating).where(
+            Judgment.judgment_list_id == judgment_list_id
+        )
+        # GROUP BY query_id into {query_id: {doc_id: rating}}
+
+    Args:
+        db: Async SQLAlchemy session. Unused in the MVP1 stub; signature
+            reserved for the real implementation.
+        judgment_list_id: UUIDv7 string referencing the ``judgment_lists``
+            parent row.
+
+    Raises:
+        JudgmentsTableMissing: always, in MVP1.
+    """
+    raise JudgmentsTableMissing(
+        f"judgments table not yet shipped (feat_llm_judgments owns it); "
+        f"judgment_list_id={judgment_list_id!r}. Integration tests must "
+        f"monkeypatch load_qrels with hand-built qrels."
+    )