PatrickSys · PatrickSys · Apr 30, 2026 · Apr 30, 2026 · gemini-code-assist · Apr 30, 2026
diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs
@@ -1191,13 +1191,20 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
     fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record])
   );
   const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task]));
+  const existingPrimaryKeys = new Set( // reservation keys of rows already present in the session manifest
+    readManifestRowsIfPresent(sessionRoot)
+      .filter((row) => !row.scoring?.baselineArmId) // only rows without scoring.baselineArmId count as "primary" rows here
+      .map((row) => primaryReservationKey(row.lane_id, row.task_id, row.repeat_index))
+  );
   for (const reservation of reservations.filter(
     (slot) => slot.status === 'terminal_missing_evidence'
   )) {
     const laneCard = cardsByLane.get(reservation.laneId);
     const task = tasksById.get(reservation.taskId);
     const evidence = evidenceByLane.get(reservation.laneId);
     if (!laneCard || !task || !evidence) continue;
+    const key = primaryReservationKey(laneCard.laneId, task.instance_id, reservation.repeatIndex); // same key shape as the manifest-derived keys above
+    if (existingPrimaryKeys.has(key)) continue; // a primary row is already recorded for this reservation, so skip instead of producing another one
     const runId = sanitize(
       `${laneCard.laneId}-${task.instance_id}-${reservation.repeatIndex}-missing-evidence`
     );
@@ -1260,6 +1267,7 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
         )
       })
     );
+    existingPrimaryKeys.add(key); // remember the key so a later reservation with the same lane/task/repeat in this loop is skipped by the has(key) guard
   }
 }
 
@@ -1887,6 +1895,10 @@ function runKey(laneId, taskId, repeatIndex, prefix = '') {
   return `${prefix}${laneId}:${taskId}:${repeatIndex}`;
 }
 
+function primaryReservationKey(laneId, taskId, repeatIndex) { // builds the string key that identifies one (lane, task, repeat) reservation
+  return `${laneId}::${taskId}::${repeatIndex}`; // the three fields joined with a '::' separator
+}
+
 function existingRunKeys(sessionRoot) {
   return new Set(
     readManifestRowsIfPresent(sessionRoot).map((row) =>
@@ -3313,15 +3325,38 @@ function validateBaselineSession(args) {
     errors.push(`expected ${expectedSlots} reserved slots, found ${reservations.length}`);
   const rows = readManifestRowsIfPresent(sessionRoot);
   validateSessionPaths(sessionRoot, rows, errors);
+  const primaryRowCounts = new Map(); // reservation key -> number of manifest rows without scoring.baselineArmId
+  for (const row of rows.filter((entry) => !entry.scoring?.baselineArmId)) {
+    const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index);
+    primaryRowCounts.set(key, (primaryRowCounts.get(key) ?? 0) + 1);
+  }
+  for (const [key, count] of primaryRowCounts) {
+    if (count > 1) errors.push(`duplicate primary baseline row for reservation ${key}`); // one error per reservation key that appears more than once
+  }
   const blockedReservations = reservations.filter(
     (slot) => slot.status === 'terminal_missing_evidence'
   );
+  const blockedReservationKeys = new Set( // keys of every reservation marked terminal_missing_evidence
+    blockedReservations.map((slot) => primaryReservationKey(slot.laneId, slot.taskId, slot.repeatIndex))
+  );
+  const blockedRowKeys = new Set(); // keys of the blocked rows actually found in the manifest
+  const extraBlockedRowKeys = []; // keys of blocked rows that have no matching blocked reservation
   const blockedRows = rows.filter(
     (row) =>
-      row.status === 'setup_failed' && ['grepai', 'codebase-memory-mcp'].includes(row.lane_id)
+      !row.scoring?.baselineArmId && // blocked rows are now recognised by their fields rather than by a fixed list of lane ids
+      row.status === 'setup_failed' &&
+      String(row.scoring?.fallbackReason ?? '').startsWith('terminal_missing_evidence:') // a missing fallbackReason becomes '' and therefore never matches
   );
-  if (blockedRows.length !== blockedReservations.length) {
-    errors.push('terminal missing-evidence rows must be present for every blocked reservation');
+  for (const row of blockedRows) {
+    const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index);
+    blockedRowKeys.add(key);
+    if (!blockedReservationKeys.has(key)) extraBlockedRowKeys.push(key); // row exists but no reservation is blocked under this key
+  }
+  const missingBlockedRowKeys = [...blockedReservationKeys].filter((key) => !blockedRowKeys.has(key)); // blocked reservations with no row in the manifest
+  if (missingBlockedRowKeys.length > 0 || extraBlockedRowKeys.length > 0) { // compares by key in both directions instead of comparing only the two counts
+    errors.push(
+      `terminal missing-evidence rows must match blocked reservations exactly; missing=${missingBlockedRowKeys.length}, extra=${extraBlockedRowKeys.length}`
+    );
   }
   if (errors.length > 0)
     throw new Error(`baseline session validation failed:\n- ${errors.join('\n- ')}`);

diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts
@@ -1,5 +1,5 @@
 import { execFileSync, spawnSync } from 'node:child_process';
-import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
+import { appendFileSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
 import { tmpdir } from 'node:os';
 import path from 'node:path';
 import { describe, expect, it, vi } from 'vitest';
@@ -173,6 +173,31 @@ describe('ContextBench Phase 40 baseline runner', () => {
     }
   });
 
+  it('rejects duplicate primary baseline rows during validation', () => { // a manifest containing the same row twice must fail --baseline-validate
+    const sessionRoot = tempSessionRoot('phase41');
+    try {
+      execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { // create the session by running the snapshot command
+        encoding: 'utf8'
+      });
+      const firstRow = readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8').trim().split('\n')[0]; // first line of the JSONL manifest
+      appendFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), `${firstRow}\n`, 'utf8'); // append a verbatim copy so that row appears twice
+
+      const result = spawnSync( // the returned object exposes status and stderr, which the assertions below inspect
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        { encoding: 'utf8' }
+      );
+
+      expect(result.status).not.toBe(0); // validation must exit with a failure status
+      expect(result.stderr).toContain('duplicate primary baseline row for reservation'); // and report the duplicate-row error text
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // removes the directory four levels above sessionRoot; presumably the temp root created by tempSessionRoot — TODO confirm
+        recursive: true,
+        force: true
+      });
+    }
+  });
+
   it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
     const sessionRoot = tempSessionRoot();
     const taskId = manifest.tasks[0].instance_id;
-  });
-
-  it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
-    const sessionRoot = tempSessionRoot();
-    const taskId = manifest.tasks[0].instance_id;
+    } finally {
+      cleanupSessionRoot(sessionRoot);
+    }
-  });
-
-  it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
-    const sessionRoot = tempSessionRoot();
-    const taskId = manifest.tasks[0].instance_id;
+    } finally {
+      cleanupSessionRoot(sessionRoot);
+    }

diff --git a/tests/contextbench-baseline-snapshot.test.ts b/tests/contextbench-baseline-snapshot.test.ts
@@ -43,6 +43,13 @@ function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
   );
 }
 
+function readRows(sessionRoot: string): Array<{ status: string; scoring?: { fallbackReason?: string } }> { // reads run-manifest.jsonl under sessionRoot and returns one parsed object per line
+  return readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8')
+    .trim() // strip surrounding whitespace so a trailing newline does not yield an empty final line for JSON.parse
+    .split('\n')
+    .map((line) => JSON.parse(line) as { status: string; scoring?: { fallbackReason?: string } }); // the cast is compile-time only; the row shape is not validated at runtime
+}
+
 describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
   it('captures the current checkout before baseline runs with hashes and validation metadata', () => {
     const sessionRoot = tempSessionRoot();
@@ -121,6 +128,47 @@ describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
     }
   });
 
+  it('does not duplicate blocked missing-evidence rows when snapshot is rerun', () => { // runs the snapshot command twice on one session and compares the blocked-row counts
+    const sessionRoot = tempSessionRoot('phase41');
+    try {
+      execFileSync( // first snapshot run
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      const firstBlockedRows = readRows(sessionRoot).filter( // blocked rows: setup_failed status plus a terminal_missing_evidence fallback reason
+        (row) =>
+          row.status === 'setup_failed' &&
+          row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:')
+      );
+
+      execFileSync( // second snapshot run against the same session root
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot],
+        { encoding: 'utf8' }
+      );
+      const secondBlockedRows = readRows(sessionRoot).filter( // same filter, applied to the manifest after the rerun
+        (row) =>
+          row.status === 'setup_failed' &&
+          row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:')
+      );
+      const validateOutput = execFileSync( // output captured from the validate command run on the same session
+        'node',
+        ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
+        { encoding: 'utf8' }
+      );
+
+      expect(firstBlockedRows).toHaveLength(20 * 2 * 3); // NOTE(review): presumably 20 tasks x 2 blocked lanes x 3 repeats — confirm against the fixtures
+      expect(secondBlockedRows).toHaveLength(firstBlockedRows.length); // the rerun must not change the number of blocked rows
+      expect(validateOutput).toContain('baseline session validation passed');
+    } finally {
+      rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { // removes the directory four levels above sessionRoot; presumably the temp root created by tempSessionRoot — TODO confirm
+        recursive: true,
+        force: true
+      });
+    }
+  });
+
   it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => {
     const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-'));
     try {