Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions scripts/contextbench-runner.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -1191,13 +1191,20 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
fixtures.laneSetupEvidence.records.map((record) => [record.laneId, record])
);
const tasksById = new Map(fixtures.manifest.tasks.map((task) => [task.instance_id, task]));
const existingPrimaryKeys = new Set(
readManifestRowsIfPresent(sessionRoot)
.filter((row) => !row.scoring?.baselineArmId)
.map((row) => primaryReservationKey(row.lane_id, row.task_id, row.repeat_index))
);
for (const reservation of reservations.filter(
(slot) => slot.status === 'terminal_missing_evidence'
)) {
const laneCard = cardsByLane.get(reservation.laneId);
const task = tasksById.get(reservation.taskId);
const evidence = evidenceByLane.get(reservation.laneId);
if (!laneCard || !task || !evidence) continue;
const key = primaryReservationKey(laneCard.laneId, task.instance_id, reservation.repeatIndex);
if (existingPrimaryKeys.has(key)) continue;
const runId = sanitize(
`${laneCard.laneId}-${task.instance_id}-${reservation.repeatIndex}-missing-evidence`
);
Expand Down Expand Up @@ -1260,6 +1267,7 @@ function writeBlockedRunRows(sessionRoot, fixtures, reservations) {
)
})
);
existingPrimaryKeys.add(key);
}
}

Expand Down Expand Up @@ -1887,6 +1895,10 @@ function runKey(laneId, taskId, repeatIndex, prefix = '') {
return `${prefix}${laneId}:${taskId}:${repeatIndex}`;
}

/**
 * Builds the lookup key for a single primary reservation slot.
 *
 * Each argument is string-interpolated and the three parts are joined
 * with a `::` separator, in the order lane, task, repeat index.
 *
 * @param laneId - Lane identifier part of the key.
 * @param taskId - Task identifier part of the key.
 * @param repeatIndex - Repeat index part of the key.
 * @returns {string} `<laneId>::<taskId>::<repeatIndex>`.
 */
function primaryReservationKey(laneId, taskId, repeatIndex) {
  const parts = [laneId, taskId, repeatIndex].map((part) => `${part}`);
  return parts.join('::');
}

function existingRunKeys(sessionRoot) {
return new Set(
readManifestRowsIfPresent(sessionRoot).map((row) =>
Expand Down Expand Up @@ -3313,15 +3325,38 @@ function validateBaselineSession(args) {
errors.push(`expected ${expectedSlots} reserved slots, found ${reservations.length}`);
const rows = readManifestRowsIfPresent(sessionRoot);
validateSessionPaths(sessionRoot, rows, errors);
const primaryRowCounts = new Map();
for (const row of rows.filter((entry) => !entry.scoring?.baselineArmId)) {
const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index);
primaryRowCounts.set(key, (primaryRowCounts.get(key) ?? 0) + 1);
}
for (const [key, count] of primaryRowCounts) {
if (count > 1) errors.push(`duplicate primary baseline row for reservation ${key}`);
}
const blockedReservations = reservations.filter(
(slot) => slot.status === 'terminal_missing_evidence'
);
const blockedReservationKeys = new Set(
blockedReservations.map((slot) => primaryReservationKey(slot.laneId, slot.taskId, slot.repeatIndex))
);
const blockedRowKeys = new Set();
const extraBlockedRowKeys = [];
const blockedRows = rows.filter(
(row) =>
row.status === 'setup_failed' && ['grepai', 'codebase-memory-mcp'].includes(row.lane_id)
!row.scoring?.baselineArmId &&
row.status === 'setup_failed' &&
String(row.scoring?.fallbackReason ?? '').startsWith('terminal_missing_evidence:')
);
if (blockedRows.length !== blockedReservations.length) {
errors.push('terminal missing-evidence rows must be present for every blocked reservation');
for (const row of blockedRows) {
const key = primaryReservationKey(row.lane_id, row.task_id, row.repeat_index);
blockedRowKeys.add(key);
if (!blockedReservationKeys.has(key)) extraBlockedRowKeys.push(key);
}
const missingBlockedRowKeys = [...blockedReservationKeys].filter((key) => !blockedRowKeys.has(key));
if (missingBlockedRowKeys.length > 0 || extraBlockedRowKeys.length > 0) {
errors.push(
`terminal missing-evidence rows must match blocked reservations exactly; missing=${missingBlockedRowKeys.length}, extra=${extraBlockedRowKeys.length}`
);
}
if (errors.length > 0)
throw new Error(`baseline session validation failed:\n- ${errors.join('\n- ')}`);
Expand Down
27 changes: 26 additions & 1 deletion tests/contextbench-baseline-runner.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { execFileSync, spawnSync } from 'node:child_process';
import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
import { appendFileSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';
import { describe, expect, it, vi } from 'vitest';
Expand Down Expand Up @@ -173,6 +173,31 @@ describe('ContextBench Phase 40 baseline runner', () => {
}
});

it('rejects duplicate primary baseline rows during validation', () => {
const sessionRoot = tempSessionRoot('phase41');
try {
execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], {
encoding: 'utf8'
});
const firstRow = readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8').trim().split('\n')[0];
appendFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), `${firstRow}\n`, 'utf8');

const result = spawnSync(
'node',
['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot],
{ encoding: 'utf8' }
);

expect(result.status).not.toBe(0);
expect(result.stderr).toContain('duplicate primary baseline row for reservation');
} finally {
rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
recursive: true,
force: true
});
Comment on lines +194 to +197
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Use the cleanupSessionRoot helper function instead of calling rmSync directly. This ensures consistency across tests and leverages the retry logic implemented in the helper to avoid potential race conditions during cleanup, especially on Windows environments.

      cleanupSessionRoot(sessionRoot);

}
});

it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
const sessionRoot = tempSessionRoot();
const taskId = manifest.tasks[0].instance_id;
Comment on lines +199 to 203
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Cleanup bypasses Windows retry/error handling

The new test uses a bare rmSync in finally without maxRetries, retryDelay, or a wrapping ignoreWindowsTempCleanupRace catch. The existing cleanupSessionRoot helper at line 59 already encapsulates all three — using the helper would keep cleanup consistent and avoid flaky failures on Windows where temp-dir handles can still be open when finally runs.

Suggested change
});
it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => {
const sessionRoot = tempSessionRoot();
const taskId = manifest.tasks[0].instance_id;
} finally {
cleanupSessionRoot(sessionRoot);
}

Expand Down
48 changes: 48 additions & 0 deletions tests/contextbench-baseline-snapshot.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string {
);
}

/**
 * Reads `run-manifest.jsonl` from `sessionRoot` and parses one JSON row per line.
 *
 * Blank lines are skipped, so an empty manifest yields `[]` instead of
 * failing inside `JSON.parse('')`.
 *
 * @param sessionRoot - Directory that contains `run-manifest.jsonl`.
 * @returns The parsed rows, in file order.
 * @throws If the file cannot be read or a non-blank line is not valid JSON.
 */
function readRows(sessionRoot: string): Array<{ status: string; scoring?: { fallbackReason?: string } }> {
  type ManifestRow = { status: string; scoring?: { fallbackReason?: string } };
  return readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8')
    .split('\n')
    .filter((line) => line.trim() !== '')
    .map((line) => JSON.parse(line) as ManifestRow);
}

describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
it('captures the current checkout before baseline runs with hashes and validation metadata', () => {
const sessionRoot = tempSessionRoot();
Expand Down Expand Up @@ -121,6 +128,47 @@ describe('ContextBench Phase 40 dirty-worktree snapshot', () => {
}
});

// Re-running the snapshot against the same session must not append a second
// copy of the blocked (terminal missing-evidence) rows, and the session must
// still pass validation afterwards.
it('does not duplicate blocked missing-evidence rows when snapshot is rerun', () => {
  const sessionRoot = tempSessionRoot('phase41');
  // Runs the runner script with the given CLI arguments and returns its stdout.
  const runRunner = (...args: string[]): string =>
    execFileSync('node', ['scripts/contextbench-runner.mjs', ...args], { encoding: 'utf8' });
  // Primary rows written for reservations blocked on missing evidence.
  const readBlockedRows = () =>
    readRows(sessionRoot).filter(
      (row) =>
        row.status === 'setup_failed' &&
        row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:')
    );
  try {
    runRunner('--baseline-snapshot', '--out', sessionRoot);
    const firstBlockedRows = readBlockedRows();

    runRunner('--baseline-snapshot', '--out', sessionRoot);
    const secondBlockedRows = readBlockedRows();

    const validateOutput = runRunner('--baseline-validate', '--session', sessionRoot);

    // NOTE(review): per the review discussion, the factors are
    // tasks (20) x blocked lanes (2) x repeats (3) in the phase41 fixture;
    // update this count if the fixture changes.
    expect(firstBlockedRows).toHaveLength(20 * 2 * 3);
    expect(secondBlockedRows).toHaveLength(firstBlockedRows.length);
    expect(validateOutput).toContain('baseline session validation passed');
  } finally {
    // Retry removal so a lingering handle on the temp directory (seen on
    // Windows) does not fail the test during cleanup.
    rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), {
      recursive: true,
      force: true,
      maxRetries: 5,
      retryDelay: 100
    });
  }
});

it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => {
const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-'));
Comment on lines +162 to 173
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Hardcoded fixture count couples test to manifest config

20 * 2 * 3 embeds the exact task count, blocked-lane count, and repeat count from the phase41 fixture. Any fixture change will silently break this assertion without a clear failure message. Adding a comment naming the source of each factor would make the intent explicit and the breakage diagnosable.

The finally block also repeats a bare rmSync without the maxRetries/retryDelay options present in the cleanupSessionRoot helper used by other tests in this file.

try {
Expand Down
Loading