# Code Execution vs NL: Noise Robustness Analysis

This notebook analyzes whether code execution is at least as robust as natural-language (NL) reasoning when the inputs to algorithmic tasks are perturbed by noise.

**Hypothesis**: Code-based reasoning is at least as robust to input noise as NL reasoning for algorithmic tasks.

In [1]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Publication-quality settings, applied in a single batch:
# 150 dpi for inline rendering, 300 dpi for figures written with savefig.
plt.rcParams.update(
    {
        "figure.dpi": 150,
        "savefig.dpi": 300,
        "font.family": "sans-serif",
        "axes.labelsize": 14,
        "axes.titlesize": 16,
        "legend.fontsize": 12,
    }
)

sns.set_style("whitegrid")

## 1. Load Data

In [2]:
# Configure paths (search a few likely locations)
CANDIDATE_DIRS = [
    Path(location)
    for location in ("../results_noise_code_vs_nl", "../results_noise")
]

# Gather every *.json file from the candidate directories that exist;
# the set removes duplicates and sorting gives a stable order.
FILES = sorted(
    {
        json_path
        for folder in CANDIDATE_DIRS
        if folder.exists()
        for json_path in folder.glob("*.json")
    }
)

print(f"Found {len(FILES)} result files")

# Preview at most the first five paths and summarise the remainder.
hidden_count = len(FILES) - 5
for shown_path in FILES[:5]:
    print(f"  {shown_path}")
if hidden_count > 0:
    print(f"  ... and {hidden_count} more")

Found 1 result files
  ../results_noise/anthropic-claude-haiku-4.5_seed1_noise_20260107_231724.json


In [3]:
def load_noise_results(files):
    """Load all noise experiment results from JSON files.

    Each file is expected to contain a JSON list whose final element is a
    summary dict (run metadata) and whose preceding elements are the
    individual result records. Files that cannot be decoded, or whose
    content does not follow that layout, are reported with a warning and
    skipped so that one bad file does not abort the whole load.

    Args:
        files: Iterable of ``pathlib.Path`` objects pointing at result files.

    Returns:
        A ``(data, meta)`` tuple of DataFrames:
        ``data`` holds one row per record, with ``model`` and ``seed`` copied
        from the file's summary (defaulting to ``"unknown"`` and ``0``) and
        ``source`` set to the file name; ``meta`` holds one row per summary.
        Either frame is empty when nothing could be loaded.
    """
    rows = []
    summaries = []

    for fp in files:
        try:
            payload = json.loads(fp.read_text())
        except ValueError:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Warning: Could not parse {fp}")
            continue

        # Empty list / object / null: nothing to load from this file.
        if not payload:
            continue

        # Guard the layout before unpacking: a JSON object, a scalar, or a
        # list without a trailing summary dict would otherwise crash below.
        if not isinstance(payload, list) or not isinstance(payload[-1], dict):
            print(f"Warning: Unexpected layout in {fp} (expected a list ending with a summary object)")
            continue

        # Last element is summary metadata, rest are records
        *records, summary = payload
        if records:
            df = pd.DataFrame.from_records(records)
            # Add metadata from summary to each record
            df["model"] = summary.get("model", "unknown")
            df["seed"] = summary.get("seed", 0)
            df["source"] = fp.name
            rows.append(df)
        summaries.append(summary)

    data = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()
    meta = pd.DataFrame.from_records(summaries) if summaries else pd.DataFrame()
    return data, meta

# Load data (or explain how to produce it when no result files were found)
if FILES:
    data, meta = load_noise_results(FILES)
    print(f"Loaded {len(data)} records from {len(FILES)} files")
else:
    for hint in (
        "No noise result files found!",
        "Generate results first via:",
        "  bash src/exps_performance/scripts/noise_code_vs_nl.sh",
        "Or:",
        "  bash src/exps_performance/scripts/noise.sh",
    ):
        print(hint)
    data, meta = pd.DataFrame(), pd.DataFrame()

# Report the distinct values of each experimental factor; this is only done
# when records were loaded and they carry the "model" column.
if data.empty:
    print("DataFrame is empty - check if JSON files contain records")
elif "model" in data.columns:
    for heading, column in (
        ("Models", "model"),
        ("Arms", "arm"),
        ("Noise types", "noise_type"),
        ("Sigma levels", "sigma"),
        ("Problem kinds", "kind"),
    ):
        print(f"{heading}: {sorted(data[column].unique())}")

Loaded 160 records from 1 files
Models: ['anthropic/claude-haiku-4.5']
Arms: ['code', 'controlsim', 'nl', 'sim']
Noise types: ['gaussian', 'uniform']
Sigma levels: [0.0, 0.25, 0.5, 0.75, 1.0]
Problem kinds: ['add', 'lcs', 'mul', 'sub']


In [4]:
# Basic cleaning/casting and filtering
if data.empty:
    print("No data to process. Run experiments first.")
else:
    # Cast every column used downstream to a predictable dtype.
    for column, dtype in (
        ("sigma", float),
        ("accuracy", float),
        ("noise_type", str),
        ("arm", str),
        ("kind", str),
    ):
        data[column] = data[column].astype(dtype)

    # Keep only the two arms being compared (NL and Code).
    data = data.loc[data["arm"].isin(["nl", "code"])].copy()

    # Problem families, inverted into a kind -> category lookup.
    families = {
        "Arithmetic": ("add", "sub", "mul"),
        "DP": ("lcs", "knap", "rod"),
        "ILP": ("ilp_assign", "ilp_prod", "ilp_partition"),
        "NP-Hard": ("tsp", "gcp", "spp", "bsp", "edp", "msp", "ksp", "tspd", "gcpd"),
        "CLRS": ("clrs30",),
    }
    category_map = {
        kind: family for family, kinds in families.items() for kind in kinds
    }
    # Kinds missing from the lookup are labelled "Other".
    data["category"] = data["kind"].map(category_map).fillna("Other")

    print(f"Filtered to {len(data)} records (nl + code arms only)")
    display(data.head())

Filtered to 80 records (nl + code arms only)


Unnamed: 0,kind,accuracy,arm,noise_type,sigma,model,seed,source,category
0,add,1.0,nl,gaussian,0.0,anthropic/claude-haiku-4.5,1,anthropic-claude-haiku-4.5_seed1_noise_2026010...,Arithmetic
1,lcs,1.0,nl,gaussian,0.0,anthropic/claude-haiku-4.5,1,anthropic-claude-haiku-4.5_seed1_noise_2026010...,DP
2,mul,0.75,nl,gaussian,0.0,anthropic/claude-haiku-4.5,1,anthropic-claude-haiku-4.5_seed1_noise_2026010...,Arithmetic
3,sub,1.0,nl,gaussian,0.0,anthropic/claude-haiku-4.5,1,anthropic-claude-haiku-4.5_seed1_noise_2026010...,Arithmetic
12,add,1.0,code,gaussian,0.0,anthropic/claude-haiku-4.5,1,anthropic-claude-haiku-4.5_seed1_noise_2026010...,Arithmetic


## 2. Aggregated Accuracy vs Noise Level

In [None]:
if data.empty:
    print("No data available for plotting.")
else:
    # Pool all problem types, models and seeds: one mean/std/count per (arm, sigma).
    agg = (
        data.groupby(["arm", "sigma"])["accuracy"]
        .agg(["mean", "std", "count"])
        .reset_index()
    )
    # Standard error of the mean; the error bars below show 1.96 * SE.
    agg["se"] = agg["std"] / np.sqrt(agg["count"])

    # Per-arm presentation: (legend label, line colour, marker).
    arm_styles = {
        "nl": ("NL (Arm 1)", "#1f77b4", "o"),
        "code": ("Code Exec (Arm 3)", "#ff7f0e", "s"),
    }

    fig, ax = plt.subplots(figsize=(8, 6))

    for arm, (arm_label, arm_color, arm_marker) in arm_styles.items():
        curve = agg.loc[agg["arm"] == arm].sort_values("sigma")
        if curve.empty:
            continue
        ax.errorbar(
            curve["sigma"],
            curve["mean"],
            yerr=1.96 * curve["se"],
            label=arm_label,
            color=arm_color,
            marker=arm_marker,
            markersize=10,
            linewidth=2,
            capsize=5,
        )

    ax.set_xlabel("Noise Level (σ)")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0, 1.05)
    ax.legend(loc="lower left")
    ax.set_title("Code Execution vs NL: Accuracy under Noise")
    ax.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

## 3. Accuracy by Noise Type

In [None]:
if data.empty:
    print("No data available for plotting.")
else:
    noise_types = sorted(data["noise_type"].unique())
    n_types = len(noise_types)

    # Lay the panels out on a grid at most three columns wide.
    n_cols = min(3, n_types)
    n_rows = -(-n_types // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(5 * n_cols, 5 * n_rows),
        sharey=True,
        squeeze=False,
    )
    axes = axes.ravel()

    # Per-arm presentation: (legend label, line colour, marker).
    arm_styles = {
        "nl": ("NL", "#1f77b4", "o"),
        "code": ("Code Exec", "#ff7f0e", "s"),
    }

    for idx, (ax, noise_type) in enumerate(zip(axes, noise_types)):
        # Mean/std/count per (arm, sigma) for this noise type only.
        agg = (
            data.loc[data["noise_type"] == noise_type]
            .groupby(["arm", "sigma"])["accuracy"]
            .agg(["mean", "std", "count"])
            .reset_index()
        )
        agg["se"] = agg["std"] / np.sqrt(agg["count"])

        for arm, (arm_label, arm_color, arm_marker) in arm_styles.items():
            curve = agg.loc[agg["arm"] == arm].sort_values("sigma")
            if curve.empty:
                continue
            ax.errorbar(
                curve["sigma"],
                curve["mean"],
                yerr=1.96 * curve["se"],
                label=arm_label,
                color=arm_color,
                marker=arm_marker,
                markersize=8,
                linewidth=2,
                capsize=4,
            )

        ax.set_title(noise_type.capitalize())
        ax.set_xlabel("σ")
        ax.set_ylim(0, 1.05)
        ax.grid(True, alpha=0.3)

        # Only the first panel carries the y-label and the legend.
        if idx == 0:
            ax.set_ylabel("Accuracy")
            ax.legend(loc="lower left")

    # Hide grid cells that have no noise type assigned.
    for unused_ax in axes[n_types:]:
        unused_ax.set_visible(False)

    fig.suptitle("Accuracy under Different Noise Types: Code Exec vs NL", y=1.02, fontsize=16)
    fig.tight_layout()
    plt.show()

## 4. Accuracy by Problem Category

In [None]:
if data.empty:
    print("No data available for plotting.")
else:
    categories = sorted(data["category"].unique())
    n_cats = len(categories)

    # Lay the panels out on a grid at most three columns wide.
    n_cols = min(3, n_cats)
    n_rows = -(-n_cats // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(5 * n_cols, 5 * n_rows),
        sharey=True,
        squeeze=False,
    )
    axes = axes.ravel()

    # Per-arm presentation: (legend label, line colour, marker).
    arm_styles = {
        "nl": ("NL", "#1f77b4", "o"),
        "code": ("Code Exec", "#ff7f0e", "s"),
    }

    for idx, (ax, category) in enumerate(zip(axes, categories)):
        # Mean/std/count per (arm, sigma) for this problem category only.
        agg = (
            data.loc[data["category"] == category]
            .groupby(["arm", "sigma"])["accuracy"]
            .agg(["mean", "std", "count"])
            .reset_index()
        )
        agg["se"] = agg["std"] / np.sqrt(agg["count"])

        for arm, (arm_label, arm_color, arm_marker) in arm_styles.items():
            curve = agg.loc[agg["arm"] == arm].sort_values("sigma")
            if curve.empty:
                continue
            ax.errorbar(
                curve["sigma"],
                curve["mean"],
                yerr=1.96 * curve["se"],
                label=arm_label,
                color=arm_color,
                marker=arm_marker,
                markersize=8,
                linewidth=2,
                capsize=4,
            )

        ax.set_title(category)
        ax.set_xlabel("σ")
        ax.set_ylim(0, 1.05)
        ax.grid(True, alpha=0.3)

        # Only the first panel carries the y-label and the legend.
        if idx == 0:
            ax.set_ylabel("Accuracy")
            ax.legend(loc="lower left")

    # Hide grid cells that have no category assigned.
    for unused_ax in axes[n_cats:]:
        unused_ax.set_visible(False)

    fig.suptitle("Accuracy by Problem Category: Code Exec vs NL", y=1.02, fontsize=16)
    fig.tight_layout()
    plt.show()

## 5. Statistical Tests

In [None]:
def run_statistical_tests(data: pd.DataFrame, delta: float = 0.05, alpha: float = 0.05) -> pd.DataFrame:
    """
    Run statistical tests to determine if Code is "similar to or as good as" NL.

    For every (noise_type, sigma) condition the NL and Code accuracies are
    paired on ``kind`` (plus ``model`` and ``seed`` when those columns exist)
    and the following are computed on the pairs:

    1. Paired t-test: H0: mean(Code) = mean(NL)
    2. Wilcoxon signed-rank test: Non-parametric alternative
    3. Non-inferiority test: H0: mean(Code) - mean(NL) <= -delta,
       evaluated as a one-sided one-sample t-test of (diff + delta) against 0.

    Conditions with fewer than 3 complete pairs, or lacking either arm, are
    skipped.

    Args:
        data: Long-format results with columns ``kind``, ``arm``,
            ``noise_type``, ``sigma`` and ``accuracy``; ``model`` and ``seed``
            are used for pairing when present.
        delta: Non-inferiority margin, in the same units as ``accuracy``.
        alpha: Significance level used for the ``code_noninferior`` decision.

    Returns:
        One row per tested condition with columns ``noise_type``, ``sigma``,
        ``n_pairs``, ``mean_nl``, ``mean_code``, ``mean_diff``, ``std_diff``,
        ``cohens_d``, ``t_pval``, ``wilcoxon_pval``, ``noninferiority_pval``
        and ``code_noninferior``. Empty when no condition qualifies.
    """
    results = []

    # Determine grouping columns based on what's available
    group_cols = ["kind"]
    if "model" in data.columns:
        group_cols.append("model")
    if "seed" in data.columns:
        group_cols.append("seed")

    for noise_type in data["noise_type"].unique():
        for sigma in sorted(data["sigma"].unique()):
            subset = data[(data["noise_type"] == noise_type) & (data["sigma"] == sigma)]

            # Pivot to get paired observations; dropna() keeps only groups
            # that have a value for every arm present in this subset.
            try:
                pivot = subset.pivot_table(
                    index=group_cols,
                    columns="arm",
                    values="accuracy"
                ).dropna()
            except Exception as e:
                print(f"Warning: Could not pivot for {noise_type}, sigma={sigma}: {e}")
                continue

            if len(pivot) < 3 or "nl" not in pivot.columns or "code" not in pivot.columns:
                continue

            nl_acc = pivot["nl"].values
            code_acc = pivot["code"].values
            diff = code_acc - nl_acc

            # Paired t-test
            t_stat, t_pval = stats.ttest_rel(code_acc, nl_acc)

            # Wilcoxon (scipy can raise ValueError, e.g. for all-zero differences)
            try:
                w_stat, w_pval = stats.wilcoxon(code_acc, nl_acc)
            except ValueError:
                w_stat, w_pval = np.nan, np.nan

            # Non-inferiority: convert the two-sided p-value of the shifted
            # differences into the one-sided p-value for mean(diff + delta) > 0.
            ni_t_stat, ni_pval_two = stats.ttest_1samp(diff + delta, 0)
            ni_pval = ni_pval_two / 2 if ni_t_stat > 0 else 1 - ni_pval_two / 2

            # Cohen's d: mean difference standardised by the root of the
            # average of the two arms' sample variances.
            pooled_std = np.sqrt((np.var(nl_acc, ddof=1) + np.var(code_acc, ddof=1)) / 2)
            cohens_d = np.mean(diff) / pooled_std if pooled_std > 0 else 0

            results.append({
                "noise_type": noise_type,
                "sigma": sigma,
                "n_pairs": len(pivot),
                "mean_nl": np.mean(nl_acc),
                "mean_code": np.mean(code_acc),
                "mean_diff": np.mean(diff),
                "std_diff": np.std(diff, ddof=1),
                "cohens_d": cohens_d,
                "t_pval": t_pval,
                "wilcoxon_pval": w_pval,
                "noninferiority_pval": ni_pval,
                "code_noninferior": ni_pval < alpha,
            })

    return pd.DataFrame(results)

# Compute the per-condition statistics table; stats_df is always defined so
# that later cells can test `stats_df.empty` safely.
if data.empty:
    stats_df = pd.DataFrame()
    print("No data available for statistical tests.")
else:
    stats_df = run_statistical_tests(data)
    if stats_df.empty:
        print("No statistical results - need both 'nl' and 'code' arms with sufficient data")
    else:
        display(stats_df)

In [None]:
if stats_df.empty:
    print("No statistical results to plot.")
else:
    # Statistical summary heatmaps: one panel per metric, each laid out as
    # noise type (rows) by sigma (columns).
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    panels = [
        # Mean difference, diverging colours centred on zero.
        (
            "mean_diff",
            "Accuracy Difference: Code - NL",
            {"cmap": "RdYlGn", "center": 0, "cbar_kws": {"label": "Mean Diff (Code - NL)"}},
        ),
        # Non-inferiority p-values, colour scale clipped to [0, 0.1].
        (
            "noninferiority_pval",
            "Non-inferiority Test (δ=0.05)",
            {"cmap": "RdYlGn_r", "vmin": 0, "vmax": 0.1, "cbar_kws": {"label": "p-value"}},
        ),
    ]

    for ax, (metric, panel_title, heatmap_kwargs) in zip(axes, panels):
        grid = stats_df.pivot(index="noise_type", columns="sigma", values=metric)
        sns.heatmap(grid, ax=ax, annot=True, fmt=".3f", **heatmap_kwargs)
        ax.set_title(panel_title)
        ax.set_xlabel("Noise Level (σ)")
        ax.set_ylabel("Noise Type")

    fig.tight_layout()
    plt.show()

## 6. Effect Size Analysis

In [None]:
if stats_df.empty:
    print("No statistical results to plot.")
else:
    # Cohen's d heatmap: noise type (rows) by sigma (columns), with the
    # colour scale fixed to [-1, 1] and centred on zero.
    pivot_d = stats_df.pivot(index="noise_type", columns="sigma", values="cohens_d")

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        pivot_d,
        ax=ax,
        cmap="RdYlGn",
        center=0,
        vmin=-1,
        vmax=1,
        annot=True,
        fmt=".2f",
        cbar_kws={"label": "Cohen's d"},
    )
    ax.set_title("Effect Size (Cohen's d): Code - NL\n(+ve = Code better)")
    ax.set_xlabel("Noise Level (σ)")
    ax.set_ylabel("Noise Type")

    fig.tight_layout()
    plt.show()

## 7. Summary and Conclusions

In [None]:
# Summarise the per-condition non-inferiority results and print a verdict.
if not stats_df.empty:
    print("=" * 60)
    print("SUMMARY")
    print("=" * 60)

    noninferior_count = stats_df["code_noninferior"].sum()
    # stats_df is non-empty in this branch, so total_tests is at least 1 and
    # the divisions/comparisons below need no zero guard.
    total_tests = len(stats_df)
    print(f"\nNon-inferiority established in {noninferior_count}/{total_tests} conditions")
    print("(Code is within 5% of NL accuracy at α=0.05)")

    # Bonferroni correction: split α=0.05 evenly across all tested conditions.
    bonferroni_alpha = 0.05 / total_tests
    bonferroni_pass = (stats_df["noninferiority_pval"] < bonferroni_alpha).sum()
    print(f"\nWith Bonferroni correction (α={bonferroni_alpha:.4f}):")
    print(f"  Non-inferiority established in {bonferroni_pass}/{total_tests} conditions")

    # Cases where Code outperforms NL (point estimate only, no significance test)
    better = stats_df[stats_df["mean_diff"] > 0]
    print(f"\nCode outperforms NL in {len(better)}/{total_tests} conditions")

    # Overall verdict, based on the uncorrected non-inferiority decisions.
    print("\n" + "=" * 60)
    print("VERDICT")
    print("=" * 60)
    if noninferior_count >= total_tests * 0.8:
        print("SUPPORTED: Code execution is similar to or as good as NL at handling noise.")
    elif noninferior_count >= total_tests * 0.5:
        print("PARTIALLY SUPPORTED: Code is non-inferior in majority of conditions.")
    else:
        # Failing to establish non-inferiority is not evidence that Code is
        # worse than NL, so the message reports only what was (not) shown.
        print("NOT SUPPORTED: Non-inferiority of Code vs NL was not established in the majority of noise conditions.")
else:
    print("No data available for summary.")
    print("\nTo generate data, run:")
    print("  bash src/exps_performance/scripts/noise_code_vs_nl.sh")