-
Notifications
You must be signed in to change notification settings - Fork 10
fix(eval): deduplicate blocked ContextBench rows #122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,5 +1,5 @@ | ||||||||||||||||||
| import { execFileSync, spawnSync } from 'node:child_process'; | ||||||||||||||||||
| import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; | ||||||||||||||||||
| import { appendFileSync, mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'; | ||||||||||||||||||
| import { tmpdir } from 'node:os'; | ||||||||||||||||||
| import path from 'node:path'; | ||||||||||||||||||
| import { describe, expect, it, vi } from 'vitest'; | ||||||||||||||||||
|
|
@@ -173,6 +173,31 @@ describe('ContextBench Phase 40 baseline runner', () => { | |||||||||||||||||
| } | ||||||||||||||||||
| }); | ||||||||||||||||||
|
|
||||||||||||||||||
| it('rejects duplicate primary baseline rows during validation', () => { | ||||||||||||||||||
| const sessionRoot = tempSessionRoot('phase41'); | ||||||||||||||||||
| try { | ||||||||||||||||||
| execFileSync('node', ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], { | ||||||||||||||||||
| encoding: 'utf8' | ||||||||||||||||||
| }); | ||||||||||||||||||
| const firstRow = readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8').trim().split('\n')[0]; | ||||||||||||||||||
| appendFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), `${firstRow}\n`, 'utf8'); | ||||||||||||||||||
|
|
||||||||||||||||||
| const result = spawnSync( | ||||||||||||||||||
| 'node', | ||||||||||||||||||
| ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], | ||||||||||||||||||
| { encoding: 'utf8' } | ||||||||||||||||||
| ); | ||||||||||||||||||
|
|
||||||||||||||||||
| expect(result.status).not.toBe(0); | ||||||||||||||||||
| expect(result.stderr).toContain('duplicate primary baseline row for reservation'); | ||||||||||||||||||
| } finally { | ||||||||||||||||||
| rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { | ||||||||||||||||||
| recursive: true, | ||||||||||||||||||
| force: true | ||||||||||||||||||
| }); | ||||||||||||||||||
| } | ||||||||||||||||||
| }); | ||||||||||||||||||
|
|
||||||||||||||||||
| it('creates fake-executor baseline attempt artifacts without scripting agent decisions', () => { | ||||||||||||||||||
| const sessionRoot = tempSessionRoot(); | ||||||||||||||||||
| const taskId = manifest.tasks[0].instance_id; | ||||||||||||||||||
|
Comment on lines
+199
to
203
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The new test uses a bare
Suggested change
|
||||||||||||||||||
|
|
||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,6 +43,13 @@ function tempSessionRoot(phase: 'phase40' | 'phase41' = 'phase40'): string { | |
| ); | ||
| } | ||
|
|
||
| function readRows(sessionRoot: string): Array<{ status: string; scoring?: { fallbackReason?: string } }> { | ||
| return readFileSync(path.join(sessionRoot, 'run-manifest.jsonl'), 'utf8') | ||
| .trim() | ||
| .split('\n') | ||
| .map((line) => JSON.parse(line) as { status: string; scoring?: { fallbackReason?: string } }); | ||
| } | ||
|
|
||
| describe('ContextBench Phase 40 dirty-worktree snapshot', () => { | ||
| it('captures the current checkout before baseline runs with hashes and validation metadata', () => { | ||
| const sessionRoot = tempSessionRoot(); | ||
|
|
@@ -121,6 +128,47 @@ describe('ContextBench Phase 40 dirty-worktree snapshot', () => { | |
| } | ||
| }); | ||
|
|
||
| it('does not duplicate blocked missing-evidence rows when snapshot is rerun', () => { | ||
| const sessionRoot = tempSessionRoot('phase41'); | ||
| try { | ||
| execFileSync( | ||
| 'node', | ||
| ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], | ||
| { encoding: 'utf8' } | ||
| ); | ||
| const firstBlockedRows = readRows(sessionRoot).filter( | ||
| (row) => | ||
| row.status === 'setup_failed' && | ||
| row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:') | ||
| ); | ||
|
|
||
| execFileSync( | ||
| 'node', | ||
| ['scripts/contextbench-runner.mjs', '--baseline-snapshot', '--out', sessionRoot], | ||
| { encoding: 'utf8' } | ||
| ); | ||
| const secondBlockedRows = readRows(sessionRoot).filter( | ||
| (row) => | ||
| row.status === 'setup_failed' && | ||
| row.scoring?.fallbackReason?.startsWith('terminal_missing_evidence:') | ||
| ); | ||
| const validateOutput = execFileSync( | ||
| 'node', | ||
| ['scripts/contextbench-runner.mjs', '--baseline-validate', '--session', sessionRoot], | ||
| { encoding: 'utf8' } | ||
| ); | ||
|
|
||
| expect(firstBlockedRows).toHaveLength(20 * 2 * 3); | ||
| expect(secondBlockedRows).toHaveLength(firstBlockedRows.length); | ||
| expect(validateOutput).toContain('baseline session validation passed'); | ||
| } finally { | ||
| rmSync(path.dirname(path.dirname(path.dirname(path.dirname(sessionRoot)))), { | ||
| recursive: true, | ||
| force: true | ||
| }); | ||
| } | ||
| }); | ||
|
|
||
| it('refuses raw baseline artifacts outside the ignored benchmark-runs root', () => { | ||
| const outDir = mkdtempSync(path.join(tmpdir(), 'contextbench-invalid-out-')); | ||
|
Comment on lines
+162
to
173
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The |
||
| try { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use the
cleanupSessionRoothelper function instead of callingrmSyncdirectly. This ensures consistency across tests and leverages the retry logic implemented in the helper to avoid potential race conditions during cleanup, especially on Windows environments.