From 7685414c64fe889ef0d1f97fadcf4ee154b7c665 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Sun, 12 Apr 2026 21:47:35 +0200 Subject: [PATCH 1/2] fix(benchmarks): make all comparator lanes cross-platform on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All five comparator adapters in scripts/benchmark-comparators.mjs were reporting setup_failed on Windows 11 due to Unix-only shell constructs. This fixes the root causes per lane so EVAL-02 (real benchmark data) is achievable. Changes by lane: - raw Claude Code: drop `2>/dev/null` from checkInstalled, switch runRawClaudeCode() from execAsync (shell, brittle quoting) to execFileAsync (no shell), add `--output-format json`, raise timeout 60s→120s, change pending_evidence fallback to hard setup_failed - codebase-memory-mcp: replace `which ... 2>/dev/null || npx` with npx-only check (cross-platform), raise initTimeout 5s→10s - jCodeMunch: replace hardcoded python3 with pythonCmd (python on Windows), use `${pythonCmd} -m pip install`, raise initTimeout 8s→15s - CodeGraphContext: same pythonCmd consistency fix as jCodeMunch - GrepAI: replace `which grepai 2>/dev/null` with `grepai --version` Adds execFile import + execFileAsync, adds pythonCmd platform constant.
--- scripts/benchmark-comparators.mjs | 53 +++++++++++++++++-------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/scripts/benchmark-comparators.mjs b/scripts/benchmark-comparators.mjs index 207ace5..843f069 100644 --- a/scripts/benchmark-comparators.mjs +++ b/scripts/benchmark-comparators.mjs @@ -14,12 +14,13 @@ import path from 'path'; import { fileURLToPath } from 'url'; import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs'; -import { execSync, exec } from 'child_process'; +import { execSync, exec, execFile } from 'child_process'; import { parseArgs } from 'util'; import { promisify } from 'util'; import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs'; const execAsync = promisify(exec); +const execFileAsync = promisify(execFile); const __dirname = path.dirname(fileURLToPath(import.meta.url)); const projectRoot = path.join(__dirname, '..'); @@ -96,15 +97,14 @@ function estimateTokens(bytes) { * - searchArgs(task): map frozen task to tool arguments * - extractPayload(result): extract string payload from MCP tool response */ +const pythonCmd = process.platform === 'win32' ? 
'python' : 'python3'; + const COMPARATOR_ADAPTERS = [ { name: 'codebase-memory-mcp', checkInstalled() { try { - // Installed via curl installer to ~/.local/bin or similar; also available via npx - execSync('which codebase-memory-mcp 2>/dev/null || npx --yes codebase-memory-mcp --version 2>/dev/null', { - stdio: 'pipe' - }); + execSync('npx --yes codebase-memory-mcp --version', { stdio: 'pipe', timeout: 30000 }); return true; } catch { return false; @@ -124,7 +124,7 @@ const COMPARATOR_ADAPTERS = [ serverCommand: 'npx', serverArgs: ['--yes', 'codebase-memory-mcp'], serverEnv: {}, - initTimeout: 5000, + initTimeout: 10000, indexTool: null, // auto-indexes on first query searchTool: 'search_code', searchArgs(task) { @@ -141,7 +141,7 @@ const COMPARATOR_ADAPTERS = [ name: 'jCodeMunch', checkInstalled() { try { - execSync('python3 -c "import jcodemunch" 2>/dev/null', { stdio: 'pipe' }); + execSync(`${pythonCmd} -c "import jcodemunch"`, { stdio: 'pipe' }); return true; } catch { return false; @@ -149,15 +149,15 @@ const COMPARATOR_ADAPTERS = [ }, async install() { try { - execSync('pip install jcodemunch-mcp', { stdio: 'pipe', timeout: 120000 }); + execSync(`${pythonCmd} -m pip install jcodemunch-mcp`, { stdio: 'pipe', timeout: 120000 }); } catch (err) { throw new Error(`jCodeMunch install failed: ${err.message}`); } }, - serverCommand: 'python3', + serverCommand: pythonCmd, serverArgs: ['-m', 'jcodemunch.server'], serverEnv: {}, - initTimeout: 8000, + initTimeout: 15000, indexTool: 'index_folder', indexArgs(rootPath) { return { path: path.resolve(rootPath) }; @@ -182,7 +182,7 @@ const COMPARATOR_ADAPTERS = [ name: 'GrepAI', checkInstalled() { try { - execSync('which grepai 2>/dev/null', { stdio: 'pipe' }); + execSync('grepai --version', { stdio: 'pipe' }); return true; } catch { return false; @@ -191,7 +191,7 @@ const COMPARATOR_ADAPTERS = [ async install() { // GrepAI requires a Go binary + Ollama embedding provider. Likely setup_failed without Ollama. 
try { - execSync('which grepai', { stdio: 'pipe' }); + execSync('grepai --version', { stdio: 'pipe' }); } catch { throw new Error( 'GrepAI requires Go binary installation (Homebrew: brew install yoanbernabeu/tap/grepai) ' + @@ -220,7 +220,7 @@ const COMPARATOR_ADAPTERS = [ name: 'CodeGraphContext', checkInstalled() { try { - execSync('python3 -c "import codegraphcontext" 2>/dev/null', { stdio: 'pipe' }); + execSync(`${pythonCmd} -c "import codegraphcontext"`, { stdio: 'pipe' }); return true; } catch { return false; @@ -228,7 +228,7 @@ const COMPARATOR_ADAPTERS = [ }, async install() { try { - execSync('pip install codegraphcontext', { stdio: 'pipe', timeout: 120000 }); + execSync(`${pythonCmd} -m pip install codegraphcontext`, { stdio: 'pipe', timeout: 120000 }); } catch (err) { throw new Error( `CodeGraphContext install failed: ${err.message}. ` + @@ -236,7 +236,7 @@ const COMPARATOR_ADAPTERS = [ ); } }, - serverCommand: 'python3', + serverCommand: pythonCmd, serverArgs: ['-m', 'codegraphcontext.server'], serverEnv: {}, initTimeout: 15000, @@ -261,7 +261,7 @@ const COMPARATOR_ADAPTERS = [ name: 'raw Claude Code', checkInstalled() { try { - execSync('claude --version 2>/dev/null', { stdio: 'pipe' }); + execSync('claude --version', { stdio: 'pipe' }); return true; } catch { return false; @@ -269,8 +269,7 @@ const COMPARATOR_ADAPTERS = [ }, async install() { throw new Error( - 'raw Claude Code baseline requires the Claude Code CLI (claude) to be installed and authenticated. ' + - 'This is the manual-log-capture baseline — record as pending_evidence if claude CLI is unavailable.' + 'raw Claude Code baseline requires the claude CLI. Install: npm install -g @anthropic-ai/claude-code' ); }, // raw Claude Code is not an MCP server; handled separately via claude -p @@ -411,11 +410,17 @@ async function runRawClaudeCode(rootPath, tasks) { try { const prompt = `You are exploring a codebase at ${path.resolve(rootPath)}. 
Answer this question using only grep, glob, and read file operations: ${task.prompt}`; - const { stdout } = await execAsync( - `claude -p "${prompt.replace(/"/g, '\\"')}" --allowedTools "Read,Grep,Glob"`, - { timeout: 60000, cwd: path.resolve(rootPath) } + const { stdout } = await execFileAsync( + 'claude', + ['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'], + { timeout: 120000, cwd: path.resolve(rootPath) } ); - payload = stdout; + try { + const parsed = JSON.parse(stdout); + payload = parsed.result ?? stdout; + } catch { + payload = stdout; + } } catch (err) { if (err.code === 'ENOENT' || err.message?.includes('command not found')) { throw new Error('claude CLI not found'); @@ -510,8 +515,8 @@ async function runComparator(adapter, repoPaths, allFixtures) { } catch (err) { if (err.message.includes('claude CLI not found')) { return { - status: 'pending_evidence', - reason: 'claude CLI not available. Run manually with: claude -p "" --allowedTools "Read,Grep,Glob"' + status: 'setup_failed', + reason: 'claude CLI not found — required for baseline. Install: npm install -g @anthropic-ai/claude-code' }; } return { status: 'setup_failed', reason: err.message }; From a1e9d42ca2d95df862f7415caeb773ea8f406c94 Mon Sep 17 00:00:00 2001 From: PatrickSys Date: Mon, 13 Apr 2026 09:17:59 +0200 Subject: [PATCH 2/2] fix(benchmarks): resolve execFileAsync .cmd issue and drop dead exec import On Windows, execFile does not use a shell and cannot resolve npm's .cmd wrappers (e.g. claude.cmd). checkInstalled() succeeded via execSync (which goes through cmd.exe) but runRawClaudeCode threw ENOENT, returning setup_failed on the very platform the previous commit targeted. Add shell: process.platform === 'win32' to the execFileAsync call so cmd.exe is used on Windows (resolves .cmd) while POSIX keeps shell: false (no injection risk, args are already an array). Also removes the dead exec / execAsync imports left over from the shell-interpolated execAsync refactor. 
--- scripts/benchmark-comparators.mjs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/benchmark-comparators.mjs b/scripts/benchmark-comparators.mjs index 843f069..0fbe5ca 100644 --- a/scripts/benchmark-comparators.mjs +++ b/scripts/benchmark-comparators.mjs @@ -14,12 +14,11 @@ import path from 'path'; import { fileURLToPath } from 'url'; import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs'; -import { execSync, exec, execFile } from 'child_process'; +import { execSync, execFile } from 'child_process'; import { parseArgs } from 'util'; import { promisify } from 'util'; import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs'; -const execAsync = promisify(exec); const execFileAsync = promisify(execFile); const __dirname = path.dirname(fileURLToPath(import.meta.url)); @@ -413,7 +412,7 @@ async function runRawClaudeCode(rootPath, tasks) { const { stdout } = await execFileAsync( 'claude', ['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'], - { timeout: 120000, cwd: path.resolve(rootPath) } + { timeout: 120000, cwd: path.resolve(rootPath), shell: process.platform === 'win32' } ); try { const parsed = JSON.parse(stdout);