# Parse Error Analysis (CORRECTED): Relationship with Code vs NL Trend

**IMPORTANT FIX**: The `code_parse_err` boolean field is misleading!
- Many records have `code_parse_err=True` but `code_err_msg="ok,ok"` AND `code_correct=True`
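- We use the `*_err_msg` fields (`nl_err_msg`, `code_err_msg`, `sim_err_msg`) to detect REAL errors instead:
  - "ok" or "ok,ok" (case-insensitive, surrounding whitespace ignored) → NO error
  - Empty or missing message, or any other value → Error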

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats as scipy_stats

# Root folder that is searched for result files (relative to the working directory).
RESULTS_DIR = Path("..") / "results"

# Global plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

In [None]:
def has_real_error(err_msg):
    """Decide whether an error-message field reports a genuine failure.

    A falsy value (``None``, empty string, ...) counts as a failure because no
    status was recorded. Any other value is stringified, lower-cased and
    stripped; only the exact statuses ``'ok'`` and ``'ok,ok'`` mean success.

    Returns:
        ``False`` for an "ok" status, ``True`` for everything else.
    """
    if not err_msg:
        # Nothing recorded at all: treat as an error.
        return True
    status = str(err_msg).lower().strip()
    return status not in ('ok', 'ok,ok')

# Load all results.
# Every res.jsonl found under RESULTS_DIR is read line by line. Each usable
# record becomes one row holding the short model name, the three correctness
# flags, and error flags derived from the *_err_msg fields via has_real_error().
# Loading is best-effort: unusable lines are skipped, but only for the specific
# reasons handled below, and the number of skipped lines is reported.
PROVIDER_PREFIXES = ('openrouter/', 'anthropic/', 'google/', 'openai/')

all_rows = []
skipped_lines = 0
for jsonl_path in RESULTS_DIR.rglob("res.jsonl"):
    # Read as UTF-8 explicitly so decoding does not depend on the platform locale.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank separator / trailing line: nothing to parse.
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Malformed JSON on this line.
                skipped_lines += 1
                continue

            model = row.get('model', '') if isinstance(row, dict) else None
            if not isinstance(model, str):
                # Line is not a JSON object, or its model field is not a string.
                skipped_lines += 1
                continue

            # Drop provider prefixes so the same model groups under one name.
            for prefix in PROVIDER_PREFIXES:
                model = model.replace(prefix, '')
            if not model:
                # Records without a model name cannot be grouped.
                continue

            all_rows.append({
                'model': model,
                'nl_correct': row.get('nl_correct', False),
                'code_correct': row.get('code_correct', False),
                'sim_correct': row.get('sim_correct', False),
                'nl_err': has_real_error(row.get('nl_err_msg', '')),
                'code_err': has_real_error(row.get('code_err_msg', '')),
                'sim_err': has_real_error(row.get('sim_err_msg', '')),
            })

df = pd.DataFrame(all_rows)
print(f"Loaded {len(df):,} rows")
if skipped_lines:
    print(f"Skipped {skipped_lines:,} malformed lines")

## Per-Model Statistics (Corrected)

In [None]:
# Per-model summary built on the corrected error flags.
# Models with fewer than 50 rows are left out. Each rate is the column mean
# expressed as a percentage; code_vs_nl is code accuracy minus NL accuracy.
percent_columns = {
    'nl_acc': 'nl_correct',
    'code_acc': 'code_correct',
    'sim_acc': 'sim_correct',
    'nl_err%': 'nl_err',
    'code_err%': 'code_err',
    'sim_err%': 'sim_err',
}

model_stats = []
for model_name, model_rows in df.groupby('model'):
    n_rows = len(model_rows)
    if n_rows >= 50:
        record = {'model': model_name, 'n': n_rows}
        for out_col, src_col in percent_columns.items():
            record[out_col] = model_rows[src_col].mean() * 100
        record['code_vs_nl'] = record['code_acc'] - record['nl_acc']
        model_stats.append(record)

stats_df = pd.DataFrame(model_stats)
stats_df = stats_df.sort_values('code_vs_nl', ascending=False)
print(f"Computed stats for {len(stats_df)} models")

In [None]:
# Render the per-model table: percentages with one decimal, the gap with an
# explicit sign, and an 'RdYlGn' gradient on the gap column clamped to [-50, 50].
percent_fmt = '{:.1f}%'
column_formats = {
    'nl_acc': percent_fmt,
    'code_acc': percent_fmt,
    'sim_acc': percent_fmt,
    'code_vs_nl': '{:+.1f}%',
    'nl_err%': percent_fmt,
    'code_err%': percent_fmt,
    'sim_err%': percent_fmt,
}
styled = stats_df.style.format(column_formats)
styled = styled.background_gradient(
    subset=['code_vs_nl'],
    cmap='RdYlGn',
    vmin=-50,
    vmax=50,
)
styled

## Correlation Analysis (Corrected)

In [None]:
# Pearson correlation between each error rate and the code-minus-NL gap,
# computed over the rows of stats_df (one row per model).
correlations = {}
for label, err_col in (
    ('Code Err% vs Code>NL', 'code_err%'),
    ('NL Err% vs Code>NL', 'nl_err%'),
):
    r, p = scipy_stats.pearsonr(stats_df[err_col], stats_df['code_vs_nl'])
    correlations[label] = {'r': r, 'p': p}

banner = "=" * 60
print(banner)
print("CORRELATION ANALYSIS (CORRECTED error detection)")
print(banner)
for name, vals in correlations.items():
    p_value = vals['p']
    # Conventional significance markers, strictest threshold first.
    if p_value < 0.001:
        sig = "***"
    elif p_value < 0.01:
        sig = "**"
    elif p_value < 0.05:
        sig = "*"
    else:
        sig = ""
    print(f"{name:<30}: r = {vals['r']:+.3f}, p = {p_value:.4f} {sig}")
print("\n* p < 0.05, ** p < 0.01, *** p < 0.001")

## Visualization

In [None]:
# Two side-by-side scatter plots (one point per model, marker size scaled by
# the model's row count) of an error rate against the code-minus-NL accuracy
# gap, each with a first-degree least-squares line.
# The significance stars in the legend and the "(SIGNIFICANT)" title suffix are
# derived from the computed p-values instead of being hard-coded, so the figure
# cannot claim a result the data does not support.
def _significance_stars(p_value):
    """Return '***', '**', '*' or '' using the 0.001 / 0.01 / 0.05 thresholds."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return ""


def _plot_error_vs_gap(ax, err_col, corr_key, color, xlabel, title):
    """Draw one error-rate vs. gap panel on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        err_col: Column of ``stats_df`` plotted on the x axis.
        corr_key: Key into ``correlations`` holding this panel's r and p.
        color: Marker colour.
        xlabel: X-axis label.
        title: Panel title; " (SIGNIFICANT)" is appended only when p < 0.05.
    """
    corr = correlations[corr_key]
    stars = _significance_stars(corr['p'])

    ax.scatter(stats_df[err_col], stats_df['code_vs_nl'], s=stats_df['n']/30, alpha=0.6, c=color)
    fit_coeffs = np.polyfit(stats_df[err_col], stats_df['code_vs_nl'], 1)
    x_line = np.linspace(0, 100, 100)  # error rates are percentages
    ax.plot(x_line, np.poly1d(fit_coeffs)(x_line), "r--", alpha=0.8, label=f'r={corr["r"]:.3f}{stars}')
    ax.axhline(y=0, color='gray', linestyle='-', alpha=0.3)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Code - NL Accuracy Gap (%)')
    ax.set_title(f'{title} (SIGNIFICANT)' if stars else title)
    ax.legend()


fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1: Code Error vs Code>NL
_plot_error_vs_gap(
    axes[0], 'code_err%', 'Code Err% vs Code>NL', 'steelblue',
    'Code Error % (from err_msg)', 'Code Error vs Code>NL Gap',
)

# Plot 2: NL Error vs Code>NL
_plot_error_vs_gap(
    axes[1], 'nl_err%', 'NL Err% vs Code>NL', 'forestgreen',
    'NL Error % (from err_msg)', 'NL Error vs Code>NL Gap',
)

plt.tight_layout()
plt.savefig('parse_error_correlation_corrected.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Box plot by error category
def categorize_err(rate):
    """Map an error rate (in percent) to its bucket label.

    Rates below 20 are low, rates from 20 up to but excluding 50 are moderate,
    and everything else falls into the high bucket.
    """
    for upper_bound, label in ((20, '0-19% (Low)'), (50, '20-49% (Moderate)')):
        if rate < upper_bound:
            return label
    return '50%+ (High)'

# Bucket every model by its code error rate, then compare the distribution of
# the code-minus-NL gap across buckets (box plot with the individual models
# overlaid as points).
stats_df['err_cat'] = stats_df['code_err%'].apply(categorize_err)

order = ['0-19% (Low)', '20-49% (Moderate)', '50%+ (High)']
colors = ['#2ecc71', '#f1c40f', '#e74c3c']

fig, ax = plt.subplots(figsize=(10, 6))

# Arguments shared by both layers so they stay aligned on the same categories.
shared_args = dict(data=stats_df, x='err_cat', y='code_vs_nl', order=order, ax=ax)
sns.boxplot(palette=colors, **shared_args)
sns.stripplot(color='black', alpha=0.5, **shared_args)

# Reference line at zero gap.
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.7)
ax.set(
    xlabel='Code Error Category (Corrected)',
    ylabel='Code - NL Accuracy Gap (%)',
    title='Code>NL Gap by Error Category',
)

plt.tight_layout()
plt.savefig('parse_error_boxplot_corrected.png', dpi=150, bbox_inches='tight')
plt.show()

## Summary by Category

In [None]:
# Per-category roll-up: number of models plus the mean gap and mean
# accuracies, rounded to one decimal place.
by_category = stats_df.groupby('err_cat')
summary = by_category.agg(
    n_models=('model', 'count'),
    avg_code_vs_nl=('code_vs_nl', 'mean'),
    avg_code_acc=('code_acc', 'mean'),
    avg_nl_acc=('nl_acc', 'mean'),
).round(1)
summary

## Key Findings (Corrected)

In [None]:
# Print the headline findings.
# Every verdict (significant or not, significance stars, direction of the
# effect) is derived from the computed r and p values instead of being
# hard-coded, so the report stays truthful when the underlying data changes.
def _significance_stars(p_value):
    """Return '***', '**', '*' or '' using the 0.001 / 0.01 / 0.05 thresholds."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return ""


def _correlation_finding(number, subject, r_value, p_value):
    """Return the report lines for one error-rate vs. Code>NL correlation.

    Args:
        number: Ordinal shown in front of the finding.
        subject: Which error rate is described ("NL" or "Code").
        r_value: Pearson correlation coefficient.
        p_value: Two-sided p-value belonging to ``r_value``.
    """
    stars = _significance_stars(p_value)
    # rstrip() removes the trailing space left behind when there are no stars.
    stat_line = f"   r = {r_value:+.3f}, p = {p_value:.4f} {stars}".rstrip()
    if not stars:
        return [
            f"{number}. NON-SIGNIFICANT: {subject} Error% vs Code>NL",
            stat_line,
            f"   {subject} errors alone don't strongly predict the gap",
        ]
    direction = "Higher" if r_value > 0 else "Lower"
    return [
        f"{number}. SIGNIFICANT CORRELATION: {subject} Error% vs Code>NL",
        stat_line,
        f"   Higher {subject} error → {direction} Code>NL gap",
    ]


def _format_gap(value):
    """Format a mean gap as a signed percentage, or 'n/a' when no model qualifies."""
    return f"{value:+.1f}%" if pd.notna(value) else "n/a"


print("="*80)
print("KEY FINDINGS (CORRECTED)")
print("="*80)

r_nl = correlations['NL Err% vs Code>NL']['r']
p_nl = correlations['NL Err% vs Code>NL']['p']
r_code = correlations['Code Err% vs Code>NL']['r']
p_code = correlations['Code Err% vs Code>NL']['p']

# The low-error subset feeds both the category average and the model listing.
low_err_subset = stats_df[stats_df['code_err%'] < 20]
low_err_avg = low_err_subset['code_vs_nl'].mean()
high_err_avg = stats_df[stats_df['code_err%'] >= 50]['code_vs_nl'].mean()

nl_finding = _correlation_finding(1, "NL", r_nl, p_nl)
if p_nl < 0.05 and r_nl > 0:
    # This reading only applies to a significant positive relationship.
    nl_finding.append("   Interpretation: When NL fails, code can still work → Code advantage")

report_lines = [
    "",
    *nl_finding,
    "",
    *_correlation_finding(2, "Code", r_code, p_code),
    "",
    "3. BY ERROR CATEGORY:",
    f"   - Low code error (<20%):  avg Code>NL = {_format_gap(low_err_avg)}",
    f"   - High code error (50%+): avg Code>NL = {_format_gap(high_err_avg)}",
    "",
    "4. BEST MODELS FOR CLEAN ANALYSIS (low code error <20%):",
    "",
]
print("\n".join(report_lines))

low_err_models = low_err_subset.sort_values('code_vs_nl', ascending=False)
for _, row in low_err_models.iterrows():
    print(f"   - {row['model']:<35} Code>NL: {row['code_vs_nl']:+.1f}%, Code Err: {row['code_err%']:.1f}%")

In [None]:
# Persist the per-model statistics table (without the index) as CSV.
output_csv = 'model_parse_error_stats_corrected.csv'
stats_df.to_csv(output_csv, index=False)
print(f"Saved to {output_csv}")