diff --git a/scripts/contextbench-runner.mjs b/scripts/contextbench-runner.mjs index 332542c..724f133 100644 --- a/scripts/contextbench-runner.mjs +++ b/scripts/contextbench-runner.mjs @@ -1191,6 +1191,11 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) { fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record]) ); const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task])); + const existingPrimaryKeys = new Set( + readManifestRowsIfPresent(sessionRoot) + .filter((row) => !row.scoring?.baselineArmId) + .map((row) => primaryReservationKey(row.lane_id, row.task_id, row.repeat_index)) + ); for (const reservation of reservations.filter( (slot) => slot.status === 'terminal_missing_evidence' )) { @@ -1198,6 +1203,8 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) { const task = tasksById.get(reservation.taskId); const evidence = evidenceByLane.get(reservation.laneId); if (!laneCard || !task || !evidence) continue; + const key = primaryReservationKey(laneCard.laneId, task.instance_id, reservation.repeatIndex); + if (existingPrimaryKeys.has(key)) continue; const runId = sanitize( `${laneCard.laneId}-${task.instance_id}-${reservation.repeatIndex}-missing-evidence` ); @@ -1260,6 +1267,7 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) { ) }) ); + existingPrimaryKeys.add(key); } } @@ -1887,6 +1895,10 @@ function runKey(laneId, taskId, repeatIndex, prefix = '') { return `${prefix}${laneId}:${taskId}:${repeatIndex}`; } +function primaryReservationKey(laneId, taskId, repeatIndex) { + return `${laneId}::${taskId}::${repeatIndex}`; +} + function existingRunKeys(sessionRoot) { return new Set( readManifestRowsIfPresent(sessionRoot).map((row) => @@ -3313,15 +3325,38 @@ function validateBaselineSession(args) { errors.push(`expected ${expectedSlots} reserved slots, found ${reservations.length}`); const rows = readManifestRowsIfPresent(sessionRoot); validateSessionPaths(sessionRoot, rows, errors); + const primaryRowCounts = new Map(); + for (const row of rows.filter((entry) => !entry.scoring?.baselineArmId)) { + const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index); + primaryRowCounts.set(key, (primaryRowCounts.get(key) ?? 0) + 1); + } + for (const [key, count] of primaryRowCounts) { + if (count > 1) errors.push(`duplicate primary baseline row for reservation ${key}`); + } const blockedReservations = reservations.filter( (slot) => slot.status === 'terminal_missing_evidence' ); + const blockedReservationKeys = new Set( + blockedReservations.map((slot) => primaryReservationKey(slot.laneId, slot.taskId, slot.repeatIndex)) + ); + const blockedRowKeys = new Set(); + const extraBlockedRowKeys = []; const blockedRows = rows.filter( (row) => - row.status === 'setup_failed' && ['grepai', 'codebase-memory-mcp'].includes(row.lane_id) + !row.scoring?.baselineArmId && + row.status === 'setup_failed' && + String(row.scoring?.fallbackReason ?? '').startsWith('terminal_missing_evidence:') ); - if (blockedRows.length !== blockedReservations.length) { - errors.push('terminal missing-evidence rows must be present for every blocked reservation'); + for (const row of blockedRows) { + const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index); + blockedRowKeys.add(key); + if (!blockedReservationKeys.has(key)) extraBlockedRowKeys.push(key); + } + const missingBlockedRowKeys = [...blockedReservationKeys].filter((key) => !blockedRowKeys.has(key)); + if (missingBlockedRowKeys.length > 0 || extraBlockedRowKeys.length > 0) { + errors.push( + `terminal missing-evidence rows must match blocked reservations exactly; missing=${missingBlockedRowKeys.length}, extra=${extraBlockedRowKeys.length}` + ); } if (errors.length > 0) throw new Error(`baseline session validation failed:\n- ${errors.join('\n- ')}`); diff --git a/tests/contextbench-baseline-runner.test.ts b/tests/contextbench-baseline-runner.test.ts index 76b7264..61064a4 100644 --- a/tests/contextbench-baseline-runner.test.ts +++ b/tests/contextbench-baseline-runner.test.ts @@ -1,5 +1,5 @@ import { execFileSync, spawnSync } from 'node:child_process'; -import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; +import { appendFileSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; import { tmpdir } from 'node:os'; import path from 'node:path'; import { describe, expect, it, vi } from 'vitest'; @@ -173,6 +173,31 @@ describe('ContextBench Phase 40 baseline runner', () => { } }); + it('rejects duplicate primary baseline rows during validation', () => { + const sessionRoot = tempSessionRoot('phase41'); + try { + execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { + encoding: 'utf8' + }); + const firstRow = readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8').trim().split('\n')[0]; + appendFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), `${firstRow}\n`, 'utf8'); + + const result = spawnSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { encoding: 'utf8' } + ); + + expect(result.status).not.toBe(0); + expect(result.stderr).toContain('duplicate primary baseline row for reservation'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => { const sessionRoot = tempSessionRoot(); const taskId = manifest.tasks[0].instance_id; diff --git a/tests/contextbench-baseline-snapshot.test.ts b/tests/contextbench-baseline-snapshot.test.ts index 6ab133b..8b0c30b 100644 --- a/tests/contextbench-baseline-snapshot.test.ts +++ b/tests/contextbench-baseline-snapshot.test.ts @@ -43,6 +43,13 @@ function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { ); } +function readRows(sessionRoot: string): Array<{ status: string; scoring?: { fallbackReason?: string } }> { + return readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8') + .trim() + .split('\n') + .map((line) => JSON.parse(line) as { status: string; scoring?: { fallbackReason?: string } }); +} + describe('ContextBench Phase 40 dirty-worktree snapshot', () => { it('captures the current checkout before baseline runs with hashes and validation metadata', () => { const sessionRoot = tempSessionRoot(); @@ -121,6 +128,47 @@ describe('ContextBench Phase 40 dirty-worktree snapshot', () => { } }); + it('does not duplicate blocked missing-evidence rows when snapshot is rerun', () => { + const sessionRoot = tempSessionRoot('phase41'); + try { + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + const firstBlockedRows = readRows(sessionRoot).filter( + (row) => + row.status === 'setup_failed' && + row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:') + ); + + execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], + { encoding: 'utf8' } + ); + const secondBlockedRows = readRows(sessionRoot).filter( + (row) => + row.status === 'setup_failed' && + row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:') + ); + const validateOutput = execFileSync( + 'node', + ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], + { encoding: 'utf8' } + ); + + expect(firstBlockedRows).toHaveLength(20 * 2 * 3); + expect(secondBlockedRows).toHaveLength(firstBlockedRows.length); + expect(validateOutput).toContain('baseline session validation passed'); + } finally { + rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { + recursive: true, + force: true + }); + } + }); + it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => { const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-')); try {