# Evolution Comparison Results Visualization

Visualizes the statistical comparison of evolved strategies against classical GNFS-inspired baselines.

**Data source**: JSON files exported via `python prototype.py --compare-baseline --export-comparison results/comparison.json`

**Analysis includes**:
- Fitness evolution trajectories (evolved vs 3 baselines)
- Statistical significance testing (Welch's t-test, p-values)
- Effect size analysis (Cohen's d)
- Convergence detection results
- Run-to-run consistency

In [None]:
# Setup and Imports
import json

import matplotlib.pyplot as plt
import numpy as np

# Global plot appearance: dark-grid theme plus a wide default canvas and
# slightly larger text for every figure created below.
plt.style.use("seaborn-v0_8-darkgrid")
plt.rcParams.update({"figure.figsize": (14, 7), "font.size": 11})


# Helper function
def significance_stars(p_value):
    """Map a p-value onto star notation.

    Returns "***" below 0.001, "**" below 0.01, "*" below 0.05 and
    "ns" for every other value.
    """
    # Cutoffs are ordered strictest first so the first match wins.
    for cutoff, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_value < cutoff:
            return stars
    return "ns"

In [None]:
# Load Comparison Data
comparison_file = "../results/test_comparison.json"  # Adjust path as needed
# Pin the encoding so the read does not depend on the platform's locale
# default (JSON interchange text is UTF-8).
with open(comparison_file, encoding="utf-8") as f:
    data = json.load(f)

# Extract metadata with safe defaults so a partially populated export still
# prints a header instead of raising KeyError.
print("📊 Comparison Analysis")
print(f"Target number: {data.get('target_number', 'N/A')}")
print(f"Number of runs: {data.get('num_runs', 0)}")
print(f"Max generations: {data.get('max_generations', 0)}")
print(f"Population size: {data.get('population_size', 0)}")
print(f"Base seed: {data.get('base_seed', 'None')}")
print()

# Shared inputs for every plotting cell below.
runs = data.get("runs", [])
analysis = data.get("analysis", {})
baseline_names = ["conservative", "balanced", "aggressive"]

## 1. Fitness Evolution Over Generations

Shows how evolved strategies improve across generations compared to fixed baseline strategies.

In [None]:
fig, ax = plt.subplots(figsize=(14, 7))

# One fitness-per-generation sequence for every run
all_evolved = [run["evolved_fitness"] for run in runs]

if not all_evolved:
    # Nothing to draw: discard the empty figure created above.
    print("⚠️  No runs found in data. Skipping fitness evolution plot.")
    plt.close(fig)
else:
    max_gens = max((len(f) for f in all_evolved), default=0)
    generations = range(max_gens)

    # Plot individual runs (semi-transparent); only the first run contributes
    # a legend entry so the legend is not flooded with duplicates.
    for i, fitness in enumerate(all_evolved):
        label = "_nolegend_" if i > 0 else "Evolved strategies"
        ax.plot(
            range(len(fitness)), fitness, "b-", alpha=0.25, linewidth=1, label=label
        )

    # Mean and spread per generation. Runs may have different lengths, so each
    # generation only averages the runs that actually reached it.
    mean_trajectory = []
    std_trajectory = []
    for gen in generations:
        gen_values = [f[gen] for f in all_evolved if len(f) > gen]
        mean_trajectory.append(np.mean(gen_values))
        std_trajectory.append(np.std(gen_values))
    mean_arr = np.asarray(mean_trajectory, dtype=float)
    std_arr = np.asarray(std_trajectory, dtype=float)

    ax.plot(
        generations,
        mean_arr,
        "b-",
        linewidth=2.5,
        label="Evolved (mean)",
        zorder=10,
    )
    ax.fill_between(
        generations,
        mean_arr - std_arr,
        mean_arr + std_arr,
        alpha=0.2,
        color="blue",
        label="±1 std dev",
    )

    # Plot baseline horizontal lines.
    # NOTE(review): baseline values are read from the first run only —
    # presumably they are identical across runs; confirm against the exporter.
    baseline_colors = {
        "conservative": "#d62728",
        "balanced": "#ff7f0e",
        "aggressive": "#2ca02c",
    }
    for name in baseline_names:
        baseline_val = runs[0]["baseline_fitness"][name]
        ax.axhline(
            baseline_val,
            linestyle="--",
            linewidth=2,
            color=baseline_colors[name],
            alpha=0.8,
            label=f"{name.title()} baseline (fitness={baseline_val:.0f})",
        )

    # Mark convergence points
    marked_any = False
    for run in runs:
        conv_gen = run["generations_to_convergence"]
        if conv_gen is None:
            continue
        trajectory = run["evolved_fitness"]
        # Guard the lookup: an index past the end would raise IndexError and a
        # negative one would silently wrap around to the tail of the run.
        if not 0 <= conv_gen < len(trajectory):
            print(
                f"⚠️  Skipping convergence marker at generation {conv_gen}: "
                f"outside recorded trajectory (length {len(trajectory)})."
            )
            continue
        ax.plot(
            conv_gen, trajectory[conv_gen], "ro", markersize=6, alpha=0.5, zorder=5
        )
        marked_any = True

    # Single proxy artist so the legend shows one entry for all markers, and
    # only when at least one marker was actually drawn.
    if marked_any:
        ax.plot([], [], "ro", markersize=6, alpha=0.5, label="Convergence point")

    ax.set_xlabel("Generation", fontsize=12, fontweight="bold")
    ax.set_ylabel("Fitness (Smooth Candidates Found)", fontsize=12, fontweight="bold")
    ax.set_title(
        "Fitness Evolution: Evolved Strategies vs Classical Baselines",
        fontsize=14,
        fontweight="bold",
    )
    ax.legend(loc="lower right", fontsize=10)
    ax.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

## 2. Statistical Comparison

Bar chart showing mean fitness with confidence intervals and statistical significance.

In [None]:
fig, ax = plt.subplots(figsize=(12, 7))

x_pos = np.arange(len(baseline_names))
width = 0.35

# Extract means, p-values, effect sizes and CIs for each baseline
comparison = analysis["comparison_results"]
evolved_means = [comparison[name]["evolved_mean"] for name in baseline_names]
baseline_means = [comparison[name]["baseline_mean"] for name in baseline_names]
p_values = [comparison[name]["p_value"] for name in baseline_names]
effect_sizes = [comparison[name]["effect_size"] for name in baseline_names]
cis = [comparison[name]["confidence_interval"] for name in baseline_names]

# Calculate CI errors (distance from mean to bounds).
# NOTE(review): this assumes confidence_interval brackets evolved_mean. If the
# exporter stores a CI of the evolved-minus-baseline difference instead, these
# distances can come out negative, which is not a valid error-bar length —
# confirm against the exporter.
ci_errors = [[(mean - ci[0]), (ci[1] - mean)] for mean, ci in zip(evolved_means, cis)]
ci_lower = [e[0] for e in ci_errors]
ci_upper = [e[1] for e in ci_errors]

# Create bars: evolved (with CI error bars) on the left, baseline on the right
bars1 = ax.bar(
    x_pos - width / 2,
    evolved_means,
    width,
    label="Evolved",
    color="#1f77b4",
    alpha=0.8,
    yerr=[ci_lower, ci_upper],
    capsize=5,
)
bars2 = ax.bar(
    x_pos + width / 2,
    baseline_means,
    width,
    label="Baseline",
    color="#7f7f7f",
    alpha=0.6,
)

# Vertical padding for the annotations; identical for every group, so it is
# computed once instead of on every loop iteration.
label_pad = max(ci_upper, default=0) * 0.1

# Add improvement percentage and significance annotations
for i, (evolved, baseline, p_val, effect) in enumerate(
    zip(evolved_means, baseline_means, p_values, effect_sizes)
):
    # Improvement percentage. The "+" format flag emits the correct sign, so
    # a regression reads "-5.0%" rather than "+-5.0%".
    if baseline > 0:
        improvement = ((evolved - baseline) / baseline) * 100
        imp_text = f"{improvement:+.1f}%"
    elif evolved > 0:
        imp_text = "+inf%"
    else:
        # Neither side scored: report no improvement, matching how the
        # improvement-distribution cell treats the same case.
        imp_text = "+0.0%"

    # Position text above taller bar
    y_pos = max(evolved, baseline) + label_pad

    # Annotation with significance and effect size
    sig = significance_stars(p_val)
    ax.text(
        i,
        y_pos,
        f"{imp_text} {sig}\n(d={effect:.2f})",
        ha="center",
        va="bottom",
        fontsize=9,
        fontweight="bold",
    )

ax.set_xlabel("Baseline Strategy", fontsize=12, fontweight="bold")
ax.set_ylabel("Mean Fitness", fontsize=12, fontweight="bold")
ax.set_title(
    "Statistical Comparison: Evolved vs Baseline Strategies",
    fontsize=14,
    fontweight="bold",
)
ax.set_xticks(x_pos)
ax.set_xticklabels([name.title() for name in baseline_names])
ax.legend(fontsize=11)
ax.grid(alpha=0.3, axis="y")

# Add significance legend
sig_text = "Significance: *** p<0.001, ** p<0.01, * p<0.05, ns = not significant"
ax.text(
    0.02,
    0.98,
    sig_text,
    transform=ax.transAxes,
    fontsize=9,
    verticalalignment="top",
    bbox={"boxstyle": "round", "facecolor": "wheat", "alpha": 0.3},
)

plt.tight_layout()
plt.show()

## 3. Effect Size Analysis

Quantifies practical significance using Cohen's d. Large effect sizes (|d| ≥ 0.8) indicate a substantial difference between evolved and baseline fitness.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

effect_sizes = [
    analysis["comparison_results"][name]["effect_size"] for name in baseline_names
]

# Color code by effect size magnitude (sign is ignored for the classification)
colors = []
for d in effect_sizes:
    if abs(d) >= 0.8:
        colors.append("#2ca02c")  # Large - green
    elif abs(d) >= 0.5:
        colors.append("#ff7f0e")  # Medium - orange
    elif abs(d) >= 0.2:
        colors.append("#ffbb00")  # Small - yellow
    else:
        colors.append("#7f7f7f")  # Negligible - gray

# Draw the bars at explicit numeric positions so the tick locations can be
# fixed before the labels are assigned (set_yticklabels on its own is only
# meant to be used after the tick positions have been fixed).
y_positions = np.arange(len(baseline_names))
bars = ax.barh(y_positions, effect_sizes, color=colors, alpha=0.7)

# Add reference lines for Cohen's d thresholds
ax.axvline(0.2, color="gray", linestyle=":", alpha=0.5, label="Small (0.2)")
ax.axvline(0.5, color="gray", linestyle="--", alpha=0.5, label="Medium (0.5)")
ax.axvline(0.8, color="gray", linestyle="-", alpha=0.5, label="Large (0.8)")

# Annotate bars with values, placed just beyond the bar tip on whichever side
# the bar extends to.
for i, d in enumerate(effect_sizes):
    label_pos = d + 0.1 if d > 0 else d - 0.1
    ha = "left" if d > 0 else "right"
    ax.text(label_pos, i, f"{d:.2f}", va="center", ha=ha, fontweight="bold")

ax.set_xlabel("Cohen's d (Effect Size)", fontsize=12, fontweight="bold")
ax.set_ylabel("Baseline Strategy", fontsize=12, fontweight="bold")
ax.set_title(
    "Effect Size Analysis: Practical Significance of Improvements",
    fontsize=14,
    fontweight="bold",
)
ax.set_yticks(y_positions)
ax.set_yticklabels([name.title() for name in baseline_names])
ax.legend(loc="lower right", fontsize=10)
ax.grid(alpha=0.3, axis="x")
plt.tight_layout()
plt.show()

## 4. Convergence Analysis

Shows when fitness plateaus occur, indicating stable evolved strategies.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Extract convergence data: runs with a recorded convergence generation vs.
# runs where it is None.
convergence_gens = [run["generations_to_convergence"] for run in runs]
converged = [g for g in convergence_gens if g is not None]
not_converged_count = len(convergence_gens) - len(converged)

# Histogram of convergence generations
if converged:
    # One unit-wide bin per generation; a single distinct value still needs
    # two edges to form a bin.
    bins = (
        range(min(converged), max(converged) + 2)
        if len(set(converged)) > 1
        else [min(converged), min(converged) + 1]
    )
    ax.hist(converged, bins=bins, color="#1f77b4", alpha=0.7, edgecolor="black")

    # Mean convergence line
    mean_conv = np.mean(converged)
    ax.axvline(
        mean_conv,
        color="red",
        linestyle="--",
        linewidth=2,
        label=f"Mean convergence: {mean_conv:.1f} gen",
    )

# Statistics text
conv_stats = analysis["convergence_stats"]
mean_gens = conv_stats.get("mean")
std_gens = conv_stats.get("std")
# NOTE(review): presumably the exporter may write null for mean/std when no
# run converged; formatting None with ":.1f" would raise TypeError, so fall
# back to a placeholder instead.
if mean_gens is None or std_gens is None:
    mean_line = "Mean: N/A\n"
else:
    mean_line = f"Mean: {mean_gens:.1f} ± {std_gens:.1f} generations\n"
stats_text = (
    f"Convergence Rate: {conv_stats['convergence_rate'] * 100:.0f}%\n"
    + mean_line
    + f"Did not converge: {not_converged_count} runs"
)

ax.text(
    0.98,
    0.98,
    stats_text,
    transform=ax.transAxes,
    fontsize=11,
    verticalalignment="top",
    horizontalalignment="right",
    bbox={"boxstyle": "round", "facecolor": "lightblue", "alpha": 0.5},
)

ax.set_xlabel("Generation Number", fontsize=12, fontweight="bold")
ax.set_ylabel("Number of Runs", fontsize=12, fontweight="bold")
ax.set_title(
    "Convergence Analysis: When Does Fitness Plateau?", fontsize=14, fontweight="bold"
)
# The legend only has content when the mean line was drawn.
if converged:
    ax.legend(fontsize=11)
ax.grid(alpha=0.3, axis="y")
plt.tight_layout()
plt.show()

## 5. Improvement Distribution

Box plots show consistency of improvement across multiple independent runs.

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Stand-in plotted for an infinite improvement (baseline of zero, evolved > 0)
inf_display_pct = 1000

# Calculate improvement for each run and each baseline
improvements_by_baseline = {name: [] for name in baseline_names}
# Tracks whether the stand-in was actually used. Searching the data for the
# value 1000 would also match a genuine 1000% improvement.
capped_any = False

for run in runs:
    trajectory = run["evolved_fitness"]
    if not trajectory:
        # No recorded generations means no final fitness to compare.
        continue
    final_fitness = trajectory[-1]
    for name in baseline_names:
        baseline_val = run["baseline_fitness"][name]
        if baseline_val > 0:
            improvement_pct = ((final_fitness - baseline_val) / baseline_val) * 100
        elif final_fitness > 0:
            # Infinite relative improvement: cap for display
            improvement_pct = inf_display_pct
            capped_any = True
        else:
            improvement_pct = 0

        improvements_by_baseline[name].append(improvement_pct)

# Create box plot
positions = range(1, len(baseline_names) + 1)
bp = ax.boxplot(
    [improvements_by_baseline[name] for name in baseline_names],
    positions=positions,
    widths=0.6,
    patch_artist=True,
    boxprops={"facecolor": "lightblue", "alpha": 0.7},
    medianprops={"color": "red", "linewidth": 2},
)

# Add individual points, jittered horizontally so overlapping values stay visible
for i, name in enumerate(baseline_names, 1):
    y = improvements_by_baseline[name]
    x = np.random.normal(i, 0.04, size=len(y))
    ax.plot(x, y, "o", alpha=0.4, color="navy", markersize=6)

ax.axhline(0, color="gray", linestyle="--", alpha=0.5, label="No improvement")
ax.set_xticks(positions)
ax.set_xticklabels([name.title() for name in baseline_names])
ax.set_xlabel("Baseline Strategy", fontsize=12, fontweight="bold")
ax.set_ylabel("Improvement (%)", fontsize=12, fontweight="bold")
ax.set_title("Improvement Distribution Across Runs", fontsize=14, fontweight="bold")
ax.grid(alpha=0.3, axis="y")
ax.legend(fontsize=11)

# Add note about infinite values
if capped_any:
    note_text = (
        "Note: Baseline=0 cases shown as 1000% for visualization (actually +inf%)"
    )
    ax.text(
        0.5,
        0.02,
        note_text,
        transform=ax.transAxes,
        fontsize=9,
        ha="center",
        va="bottom",
        style="italic",
        bbox={"boxstyle": "round", "facecolor": "yellow", "alpha": 0.3},
    )

plt.tight_layout()
plt.show()

## 6. Summary Statistics

Complete statistical report for all baseline comparisons.

In [None]:
print("=" * 80)
print("SUMMARY STATISTICS TABLE")
print("=" * 80)
print()

for name in baseline_names:
    result = analysis["comparison_results"][name]
    evolved_mean = result["evolved_mean"]
    baseline_mean = result["baseline_mean"]

    print(f"{name.upper()} BASELINE:")
    print(f"  Evolved mean:  {evolved_mean:.1f}")
    print(f"  Baseline mean: {baseline_mean:.1f}")

    # The "+" format flag emits the correct sign, so a regression prints
    # "-5.0%" rather than "+-5.0%".
    if baseline_mean > 0:
        improvement = ((evolved_mean - baseline_mean) / baseline_mean) * 100
        print(f"  Improvement:   {improvement:+.1f}%")
    elif evolved_mean > 0:
        print("  Improvement:   +inf%")
    else:
        # Neither side scored: no improvement rather than an infinite one.
        print("  Improvement:   +0.0%")

    sig = "✓ YES" if result["is_significant"] else "✗ NO"
    print(f"  p-value:       {result['p_value']:.4f} ({sig})")
    print(f"  Effect size:   {result['effect_size']:.2f} (Cohen's d)")

    ci_low, ci_high = result["confidence_interval"][0], result["confidence_interval"][1]
    print(f"  95% CI:        [{ci_low:.1f}, {ci_high:.1f}]")
    print()

print("=" * 80)
print("CONVERGENCE STATISTICS:")
print("=" * 80)

conv = analysis["convergence_stats"]
# Same safe default as the metadata printout in the load cell.
num_runs = data.get("num_runs", 0)
# round(), not int(): the float product can land just below the true count
# (0.57 * 100 == 56.99999999999999) and int() would truncate it to 56.
converged_runs = round(conv["convergence_rate"] * num_runs)
print(
    f"  Convergence rate: {conv['convergence_rate'] * 100:.0f}% ({converged_runs}/{num_runs} runs)"
)

mean_gens = conv.get("mean")
std_gens = conv.get("std")
# NOTE(review): presumably mean/std may be null when no run converged;
# formatting None with ":.1f" would raise TypeError.
if mean_gens is None or std_gens is None:
    print("  Mean generations: N/A")
else:
    print(f"  Mean generations: {mean_gens:.1f} ± {std_gens:.1f}")
print()

## 📊 How to Interpret Results

### Statistical Significance (p-value)
- **p < 0.05**: Statistically significant improvement (marked with *)
- **p < 0.01**: Highly significant (**)
- **p < 0.001**: Very highly significant (***)
- **p ≥ 0.05**: Not statistically significant (ns) - difference could be due to chance

### Effect Size (Cohen's d)
- **|d| < 0.2**: Negligible practical difference
- **0.2 ≤ |d| < 0.5**: Small but meaningful effect
- **0.5 ≤ |d| < 0.8**: Medium effect - noticeable in practice
- **|d| ≥ 0.8**: Large effect - substantial practical significance
- The thresholds apply to the magnitude of d; the sign only indicates the direction of the difference

### Confidence Interval (95% CI)
- Range where the true difference likely falls
- If CI excludes 0, the improvement is statistically significant
- Wider CI = more uncertainty (fewer runs or high variance)

### Convergence
- **High convergence rate** (>70%): Evolution reliably finds stable solutions
- **Low mean generations**: Fast convergence - efficient optimization
- **Non-convergence**: Fitness still improving - may need more generations

### Key Questions to Ask
1. Did evolution beat all baselines significantly?
2. Are effect sizes large enough to matter practically?
3. Is improvement consistent across runs (narrow box plots)?
4. Does convergence happen early enough for efficiency?