From 7685414c64fe889ef0d1f97fadcf4ee154b7c665 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Sun, 12 Apr 2026 21:47:35 +0200
Subject: [PATCH 1/2] fix(benchmarks): make all comparator lanes cross-platform
 on Windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All five comparator adapters in scripts/benchmark-comparators.mjs
reported setup_failed on Windows 11 because of Unix-only shell
constructs. This fixes the root cause in each lane so that EVAL-02 (real
benchmark data) is achievable.

Changes by lane:
- raw Claude Code: drop `2>/dev/null` from checkInstalled, switch
  runRawClaudeCode() from execAsync (shell, brittle quoting) to
  execFileAsync (no shell), add `--output-format json`, raise timeout
  60s→120s, change pending_evidence fallback to hard setup_failed
- codebase-memory-mcp: replace `which ... 2>/dev/null || npx` with
  npx-only check (cross-platform), raise initTimeout 5s→10s
- jCodeMunch: replace hardcoded python3 with pythonCmd (python on
  Windows), use `python -m pip install`, raise initTimeout 8s→15s
- CodeGraphContext: same pythonCmd consistency fix as jCodeMunch
- GrepAI: replace `which grepai 2>/dev/null` with `grepai --version`

Adds execFile import + execFileAsync, adds pythonCmd platform constant.
---
 scripts/benchmark-comparators.mjs | 53 +++++++++++++++++--------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/scripts/benchmark-comparators.mjs b/scripts/benchmark-comparators.mjs
index 207ace5..843f069 100644
--- a/scripts/benchmark-comparators.mjs
+++ b/scripts/benchmark-comparators.mjs
@@ -14,12 +14,13 @@
 import path from 'path';
 import { fileURLToPath } from 'url';
 import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs';
-import { execSync, exec } from 'child_process';
+import { execSync, exec, execFile } from 'child_process';
 import { parseArgs } from 'util';
 import { promisify } from 'util';
 import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs';
 
 const execAsync = promisify(exec);
+const execFileAsync = promisify(execFile);
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const projectRoot = path.join(__dirname, '..');
@@ -96,15 +97,14 @@ function estimateTokens(bytes) {
  * - searchArgs(task): map frozen task to tool arguments
  * - extractPayload(result): extract string payload from MCP tool response
  */
+const pythonCmd = process.platform === 'win32' ? 'python' : 'python3';
+
 const COMPARATOR_ADAPTERS = [
   {
     name: 'codebase-memory-mcp',
     checkInstalled() {
       try {
-        // Installed via curl installer to ~/.local/bin or similar; also available via npx
-        execSync('which codebase-memory-mcp 2>/dev/null || npx --yes codebase-memory-mcp --version 2>/dev/null', {
-          stdio: 'pipe'
-        });
+        execSync('npx --yes codebase-memory-mcp --version', { stdio: 'pipe', timeout: 30000 });
         return true;
       } catch {
         return false;
@@ -124,7 +124,7 @@ const COMPARATOR_ADAPTERS = [
     serverCommand: 'npx',
     serverArgs: ['--yes', 'codebase-memory-mcp'],
     serverEnv: {},
-    initTimeout: 5000,
+    initTimeout: 10000,
     indexTool: null, // auto-indexes on first query
     searchTool: 'search_code',
     searchArgs(task) {
@@ -141,7 +141,7 @@ const COMPARATOR_ADAPTERS = [
     name: 'jCodeMunch',
     checkInstalled() {
       try {
-        execSync('python3 -c "import jcodemunch" 2>/dev/null', { stdio: 'pipe' });
+        execSync(`${pythonCmd} -c "import jcodemunch"`, { stdio: 'pipe' });
         return true;
       } catch {
         return false;
@@ -149,15 +149,15 @@ const COMPARATOR_ADAPTERS = [
     },
     async install() {
       try {
-        execSync('pip install jcodemunch-mcp', { stdio: 'pipe', timeout: 120000 });
+        execSync(`${pythonCmd} -m pip install jcodemunch-mcp`, { stdio: 'pipe', timeout: 120000 });
       } catch (err) {
         throw new Error(`jCodeMunch install failed: ${err.message}`);
       }
     },
-    serverCommand: 'python3',
+    serverCommand: pythonCmd,
     serverArgs: ['-m', 'jcodemunch.server'],
     serverEnv: {},
-    initTimeout: 8000,
+    initTimeout: 15000,
     indexTool: 'index_folder',
     indexArgs(rootPath) {
       return { path: path.resolve(rootPath) };
@@ -182,7 +182,7 @@ const COMPARATOR_ADAPTERS = [
     name: 'GrepAI',
     checkInstalled() {
       try {
-        execSync('which grepai 2>/dev/null', { stdio: 'pipe' });
+        execSync('grepai --version', { stdio: 'pipe' });
         return true;
       } catch {
         return false;
@@ -191,7 +191,7 @@ const COMPARATOR_ADAPTERS = [
     async install() {
       // GrepAI requires a Go binary + Ollama embedding provider. Likely setup_failed without Ollama.
       try {
-        execSync('which grepai', { stdio: 'pipe' });
+        execSync('grepai --version', { stdio: 'pipe' });
       } catch {
         throw new Error(
           'GrepAI requires Go binary installation (Homebrew: brew install yoanbernabeu/tap/grepai) ' +
@@ -220,7 +220,7 @@ const COMPARATOR_ADAPTERS = [
     name: 'CodeGraphContext',
     checkInstalled() {
       try {
-        execSync('python3 -c "import codegraphcontext" 2>/dev/null', { stdio: 'pipe' });
+        execSync(`${pythonCmd} -c "import codegraphcontext"`, { stdio: 'pipe' });
         return true;
       } catch {
         return false;
@@ -228,7 +228,7 @@ const COMPARATOR_ADAPTERS = [
     },
     async install() {
       try {
-        execSync('pip install codegraphcontext', { stdio: 'pipe', timeout: 120000 });
+        execSync(`${pythonCmd} -m pip install codegraphcontext`, { stdio: 'pipe', timeout: 120000 });
       } catch (err) {
         throw new Error(
           `CodeGraphContext install failed: ${err.message}. ` +
@@ -236,7 +236,7 @@ const COMPARATOR_ADAPTERS = [
         );
       }
     },
-    serverCommand: 'python3',
+    serverCommand: pythonCmd,
     serverArgs: ['-m', 'codegraphcontext.server'],
     serverEnv: {},
     initTimeout: 15000,
@@ -261,7 +261,7 @@ const COMPARATOR_ADAPTERS = [
     name: 'raw Claude Code',
     checkInstalled() {
       try {
-        execSync('claude --version 2>/dev/null', { stdio: 'pipe' });
+        execSync('claude --version', { stdio: 'pipe' });
         return true;
       } catch {
         return false;
@@ -269,8 +269,7 @@ const COMPARATOR_ADAPTERS = [
     },
     async install() {
       throw new Error(
-        'raw Claude Code baseline requires the Claude Code CLI (claude) to be installed and authenticated. ' +
-        'This is the manual-log-capture baseline — record as pending_evidence if claude CLI is unavailable.'
+        'raw Claude Code baseline requires the claude CLI. Install: npm install -g @anthropic-ai/claude-code'
       );
     },
     // raw Claude Code is not an MCP server; handled separately via claude -p
@@ -411,11 +410,17 @@ async function runRawClaudeCode(rootPath, tasks) {
 
     try {
       const prompt = `You are exploring a codebase at ${path.resolve(rootPath)}. Answer this question using only grep, glob, and read file operations: ${task.prompt}`;
-      const { stdout } = await execAsync(
-        `claude -p "${prompt.replace(/"/g, '\\"')}" --allowedTools "Read,Grep,Glob"`,
-        { timeout: 60000, cwd: path.resolve(rootPath) }
+      const { stdout } = await execFileAsync(
+        'claude',
+        ['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'],
+        { timeout: 120000, cwd: path.resolve(rootPath) }
       );
-      payload = stdout;
+      try {
+        const parsed = JSON.parse(stdout);
+        payload = parsed.result ?? stdout;
+      } catch {
+        payload = stdout;
+      }
     } catch (err) {
       if (err.code === 'ENOENT' || err.message?.includes('command not found')) {
         throw new Error('claude CLI not found');
@@ -510,8 +515,8 @@ async function runComparator(adapter, repoPaths, allFixtures) {
       } catch (err) {
         if (err.message.includes('claude CLI not found')) {
           return {
-            status: 'pending_evidence',
-            reason: 'claude CLI not available. Run manually with: claude -p "<task>" --allowedTools "Read,Grep,Glob"'
+            status: 'setup_failed',
+            reason: 'claude CLI not found — required for baseline. Install: npm install -g @anthropic-ai/claude-code'
           };
         }
         return { status: 'setup_failed', reason: err.message };

From a1e9d42ca2d95df862f7415caeb773ea8f406c94 Mon Sep 17 00:00:00 2001
From: PatrickSys <rossellocolompatrick@gmail.com>
Date: Mon, 13 Apr 2026 09:17:59 +0200
Subject: [PATCH 2/2] fix(benchmarks): resolve execFileAsync .cmd issue and
 drop dead exec import

On Windows, execFile does not use a shell and cannot resolve npm's .cmd
wrappers (e.g. claude.cmd). checkInstalled() succeeded via execSync (which
goes through cmd.exe) but runRawClaudeCode threw ENOENT, returning
setup_failed on the very platform the previous commit targeted.

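Add shell: process.platform === 'win32' to the execFileAsync call so
cmd.exe is used on Windows (resolves .cmd) while POSIX keeps shell: false
(no injection risk there, since the args are passed as an array). On
Windows the args are joined into a single cmd.exe command line without
escaping, so prompts containing shell metacharacters are not safe there.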

Also removes the dead exec import and the execAsync constant left over
from the shell-interpolated execAsync refactor.
---
 scripts/benchmark-comparators.mjs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/benchmark-comparators.mjs b/scripts/benchmark-comparators.mjs
index 843f069..0fbe5ca 100644
--- a/scripts/benchmark-comparators.mjs
+++ b/scripts/benchmark-comparators.mjs
@@ -14,12 +14,11 @@
 import path from 'path';
 import { fileURLToPath } from 'url';
 import { readFileSync, writeFileSync, mkdirSync, existsSync } from 'fs';
-import { execSync, exec, execFile } from 'child_process';
+import { execSync, execFile } from 'child_process';
 import { parseArgs } from 'util';
 import { promisify } from 'util';
 import { withManagedStdioClientSession } from './lib/managed-mcp-session.mjs';
 
-const execAsync = promisify(exec);
 const execFileAsync = promisify(execFile);
 
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
@@ -413,7 +412,7 @@ async function runRawClaudeCode(rootPath, tasks) {
       const { stdout } = await execFileAsync(
         'claude',
         ['-p', prompt, '--output-format', 'json', '--allowedTools', 'Read,Grep,Glob'],
-        { timeout: 120000, cwd: path.resolve(rootPath) }
+        { timeout: 120000, cwd: path.resolve(rootPath), shell: process.platform === 'win32' }
       );
       try {
         const parsed = JSON.parse(stdout);