# C1 Validation Exploration Notebook

Interactive analysis of C1 validation results for Hypothesis H1a.

**Hypothesis**: Collaborative mode with feedback-guided mutations outperforms baselines.

**Status**: ❌ NOT SUPPORTED (emergence=0.954, p=0.579, d=-0.58)

In [None]:
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats

# Global plot appearance: seaborn's "whitegrid" theme, a 12x6 default figure
# size and an 11pt base font for every figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6), "font.size": 11})

## 1. Load Data

In [None]:
# Load the pre-computed H1a statistical analysis results.
# JSON files are UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
data_path = Path("../results/c1_validation/h1a_analysis.json")
with open(data_path, encoding="utf-8") as f:
    data = json.load(f)

# Fitness values for each mode, converted to NumPy arrays so the later cells
# can call .mean() / .std() on them directly.
collaborative = np.array(data["collaborative_fitness"])
search_only = np.array(data["search_only_fitness"])
rulebased = np.array(data["rulebased_fitness"])

# Pre-computed sections of the analysis file. `metrics` and `h1a_criteria`
# are read by later cells; `stats_tests` and `ci` are loaded but not
# referenced by the cells visible in this notebook.
metrics = data["metrics"]
stats_tests = data["statistical_tests"]
ci = data["confidence_intervals"]
h1a_criteria = data["h1a_criteria"]

print("✓ Data loaded successfully")
print(f"  Collaborative runs: {len(collaborative)}")
print(f"  Search-only runs: {len(search_only)}")
print(f"  Rule-based runs: {len(rulebased)}")

## 2. Descriptive Statistics

In [None]:
def describe_mode(data, name):
    """Print a block of summary statistics for one mode.

    Under a header line containing ``name``, prints the mean, median, sample
    standard deviation (``ddof=1``), minimum, maximum, range and coefficient
    of variation (as a percentage) of ``data``, each to one decimal place.
    """
    average = data.mean()
    sample_sd = data.std(ddof=1)
    lowest, highest = data.min(), data.max()

    report = [
        f"\n{name}:",
        f"  Mean: {average:.1f}",
        f"  Median: {np.median(data):.1f}",
        f"  Std Dev: {sample_sd:.1f}",
        f"  Min: {lowest:.1f}",
        f"  Max: {highest:.1f}",
        f"  Range: {highest - lowest:.1f}",
        f"  CV: {(sample_sd / average) * 100:.1f}%",
    ]
    for line in report:
        print(line)


# Print the summary block for each mode, in the same order used throughout
# the notebook.
for mode_values, mode_name in (
    (collaborative, "Collaborative"),
    (search_only, "Search-only"),
    (rulebased, "Rule-based"),
):
    describe_mode(mode_values, mode_name)

## 3. Distribution Analysis

In [None]:
# Compare the three fitness distributions side by side: histogram, box plot
# and violin plot. The sample list and the label list share one order.
fitness_by_mode = [collaborative, search_only, rulebased]
mode_labels = ["Collaborative", "Search-only", "Rule-based"]
positions = [1, 2, 3]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Histogram comparison
axes[0].hist(
    fitness_by_mode,
    label=mode_labels,
    bins=8,
    alpha=0.6,
    edgecolor="black",
)
axes[0].set_xlabel("Final Fitness")
axes[0].set_ylabel("Frequency")
axes[0].set_title("Distribution Comparison")
axes[0].legend()

# Box plot
# boxplot()'s `labels` keyword was renamed to `tick_labels` in Matplotlib 3.9
# and the old name is deprecated. Setting the tick labels explicitly (boxes
# are drawn at positions 1..N by default) works on old and new versions and
# matches how the violin panel below is labelled.
axes[1].boxplot(fitness_by_mode)
axes[1].set_xticks(positions)
axes[1].set_xticklabels(mode_labels)
axes[1].set_ylabel("Final Fitness")
axes[1].set_title("Box Plot Comparison")
axes[1].grid(axis="y", alpha=0.3)

# Violin plot
parts = axes[2].violinplot(
    fitness_by_mode,
    positions=positions,
    showmeans=True,
    showmedians=True,
)
axes[2].set_xticks(positions)
axes[2].set_xticklabels(mode_labels)
axes[2].set_ylabel("Final Fitness")
axes[2].set_title("Violin Plot Comparison")
axes[2].grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Normality Testing

In [None]:
# Shapiro-Wilk normality check for each mode
def test_normality(data, name):
    """Run a Shapiro-Wilk test on ``data`` and print the outcome.

    Prints the W statistic and p-value to four decimals under a header line
    containing ``name``, followed by a verdict: normality is reported as not
    rejected when ``p > 0.05`` and as rejected otherwise.
    """
    w_stat, p_val = stats.shapiro(data)
    if p_val > 0.05:
        verdict = f"  ✓ Normality not rejected (p={p_val:.4f} > 0.05)"
    else:
        verdict = f"  ⚠ Normality rejected (p={p_val:.4f} < 0.05)"

    for line in (
        f"\n{name}:",
        f"  W-statistic: {w_stat:.4f}",
        f"  p-value: {p_val:.4f}",
        verdict,
    ):
        print(line)


print("Shapiro-Wilk Normality Tests:")
for sample, label in (
    (collaborative, "Collaborative"),
    (search_only, "Search-only"),
    (rulebased, "Rule-based"),
):
    test_normality(sample, label)

In [None]:
# Visual normality check: one normal Q-Q panel per mode
qq_samples = {
    "Collaborative": collaborative,
    "Search-only": search_only,
    "Rule-based": rulebased,
}
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (name, data_arr) in zip(axes, qq_samples.items()):
    # probplot draws the sample quantiles against normal quantiles on `ax`
    stats.probplot(data_arr, dist="norm", plot=ax)
    ax.set_title(f"Q-Q Plot: {name}")
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Statistical Tests

In [None]:
# Welch's t-test (collaborative vs. rule-based). equal_var=False means the
# two modes are not assumed to share a variance.
t_stat, p_value = stats.ttest_ind(collaborative, rulebased, equal_var=False)

print("Welch's t-test (Collaborative vs. Rule-based):")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Significant at α=0.05: {p_value < 0.05}")

# Cohen's d effect size.
# The denominator is the square root of the mean of the two sample variances,
# which coincides with the classic pooled standard deviation when both groups
# have the same number of runs.
pooled_std = np.sqrt((collaborative.std(ddof=1) ** 2 + rulebased.std(ddof=1) ** 2) / 2)
cohens_d = (collaborative.mean() - rulebased.mean()) / pooled_std

print(f"\nCohen's d effect size: {cohens_d:.4f}")
# Cohen's conventional benchmarks are 0.2 (small), 0.5 (medium) and
# 0.8 (large); a magnitude below the "small" benchmark is labelled negligible.
if abs(cohens_d) < 0.2:
    print("  Interpretation: Negligible effect")
elif abs(cohens_d) < 0.5:
    print("  Interpretation: Small effect")
elif abs(cohens_d) < 0.8:
    print("  Interpretation: Medium effect")
else:
    print("  Interpretation: Large effect")

# A negative d means the collaborative mean is below the rule-based mean
if cohens_d < 0:
    print("  Direction: Collaborative UNDERPERFORMED rule-based")

In [None]:
# Omnibus comparison: one-way ANOVA across all three modes
f_stat, p_anova = stats.f_oneway(collaborative, search_only, rulebased)
anova_significant = p_anova < 0.05

print("\nOne-way ANOVA (all three modes):")
print(f"  F-statistic: {f_stat:.4f}")
print(f"  p-value: {p_anova:.4f}")
print(f"  Significant at α=0.05: {anova_significant}")

print(
    "  ✓ At least one mode differs significantly from others"
    if anova_significant
    else "  ⚠ No significant difference between modes"
)

## 6. Confidence Intervals

In [None]:
# Confidence interval helper (95% unless another level is requested)
def calculate_ci(data, confidence=0.95):
    """Return the two-sided t-based confidence interval for the mean of ``data``.

    The half-width is the standard error of the mean (``scipy.stats.sem``)
    multiplied by the Student-t quantile at ``(1 + confidence) / 2`` with
    ``len(data) - 1`` degrees of freedom.

    Returns:
        A ``(lower, upper)`` tuple centred on ``data.mean()``.
    """
    dof = len(data) - 1
    centre = data.mean()
    t_crit = stats.t.ppf((1 + confidence) / 2, dof)
    half_width = stats.sem(data) * t_crit
    return centre - half_width, centre + half_width


# Interval for the mean of each mode (default 95% level)
ci_collaborative = calculate_ci(collaborative)
ci_search_only = calculate_ci(search_only)
ci_rulebased = calculate_ci(rulebased)

print("95% Confidence Intervals:")
for ci_label, (ci_low, ci_high) in (
    ("Collaborative:", ci_collaborative),
    ("Search-only:", ci_search_only),
    ("Rule-based:", ci_rulebased),
):
    # Labels are padded to a common width so the brackets line up
    print(f"  {ci_label:<14} [{ci_low:.1f}, {ci_high:.1f}]")

# Overlap between the collaborative and rule-based intervals, expressed as a
# percentage of the span covered by the two intervals together.
collab_low, collab_high = ci_collaborative
rule_low, rule_high = ci_rulebased

overlap_start = max(collab_low, rule_low)
overlap_end = min(collab_high, rule_high)
overlap = max(0, overlap_end - overlap_start)  # 0 when the intervals are disjoint
total_range = max(collab_high, rule_high) - min(collab_low, rule_low)
overlap_pct = (overlap / total_range) * 100

print(f"\nCI overlap (Collaborative vs. Rule-based): {overlap_pct:.1f}%")
if overlap_pct < 5:
    evidence = "  Strong evidence of difference"
elif overlap_pct < 25:
    evidence = "  Moderate evidence of difference"
else:
    evidence = "  Weak evidence of difference"
print(evidence)

## 7. Emergence Factor Analysis

In [None]:
# Emergence metrics are read from the `metrics` mapping, not recomputed here
emergence = metrics["emergence_factor"]
synergy = metrics["synergy_score"]
improvement_pct = metrics["improvement_pct"]

status = "✓ PASSED" if emergence > 1.1 else "❌ FAILED"

print("Emergence Analysis:")
print(f"  Emergence factor: {emergence:.4f}")
print("  Target: >1.1 (10% improvement)")
print(f"  Status: {status}")
print(f"\n  Synergy score: {synergy:.1f}")
print(f"  Improvement: {improvement_pct:.2f}%")

# Three outcomes: below the baseline, above it but short of the 1.1 target,
# or at/above the target.
if emergence < 1.0:
    verdict = f"\n  ⚠ Collaborative UNDERPERFORMED by {(1 - emergence) * 100:.1f}%"
elif emergence < 1.1:
    verdict = f"\n  ⚠ Collaborative improved by {(emergence - 1) * 100:.1f}%, but below 10% target"
else:
    verdict = f"\n  ✓ Collaborative exceeded target by {(emergence - 1.1) * 100:.1f}%"
print(verdict)

In [None]:
# Bar chart: collaborative mean vs. the rule-based baseline vs. the +10% target
baseline_mean = metrics["rulebased_mean"]
target_mean = metrics["rulebased_mean"] * 1.1

fig, ax = plt.subplots(figsize=(10, 6))

modes = ["Collaborative", "Rule-based\n(baseline)", "Emergence\nTarget"]
values = [metrics["collaborative_mean"], baseline_mean, target_mean]
# The collaborative bar is red while the emergence factor is below target,
# green otherwise.
colors = ["#e74c3c" if emergence < 1.1 else "#2ecc71", "#95a5a6", "#3498db"]

bars = ax.bar(modes, values, color=colors, alpha=0.7, edgecolor="black", linewidth=1.5)

# Write each bar's value just above its top edge, centred horizontally
for bar, val in zip(bars, values):
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(
        label_x,
        bar.get_height(),
        f"{val:.0f}",
        ha="center",
        va="bottom",
        fontweight="bold",
    )

ax.set_ylabel("Mean Final Fitness", fontweight="bold")
ax.set_title(
    f"Emergence Factor: {emergence:.3f} (Target: >1.1)", fontweight="bold", fontsize=14
)

# Dashed reference lines at the baseline and at the target
for level, line_color, line_label in (
    (baseline_mean, "gray", "Baseline"),
    (target_mean, "blue", "Target (+10%)"),
):
    ax.axhline(y=level, color=line_color, linestyle="--", alpha=0.5, label=line_label)

ax.legend()
ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Power Analysis

In [None]:
# Retrospective power analysis for the two-group comparison
# (collaborative vs. rule-based).
# NOTE: power computed from the observed effect size is a direct function of
# the test's p-value, so read it as descriptive only; the required-N estimate
# below is the more actionable number.
from statsmodels.stats.power import tt_ind_solve_power

# Observed effect size
observed_d = abs(cohens_d)
n_per_group = len(collaborative)

# Achieved power of the independent two-sample t-test.
# `ttest_power` models a one-sample/paired design and therefore overstates
# power for two independent groups; it also disagreed with the two-sample
# solver used for the required sample size below. Leaving `power=None` makes
# tt_ind_solve_power solve for power. `ratio` is nobs2 / nobs1.
achieved_power = tt_ind_solve_power(
    effect_size=observed_d,
    nobs1=n_per_group,
    alpha=0.05,
    power=None,
    ratio=len(rulebased) / n_per_group,
    alternative="two-sided",
)

print("Retrospective Power Analysis:")
print(f"  Observed effect size (d): {observed_d:.4f}")
print(f"  Sample size per group: {n_per_group}")
print(f"  Achieved power: {achieved_power:.2f}")
print("  Target power: 0.80")

if achieved_power < 0.80:
    print(f"\n  ⚠ Study was UNDERPOWERED (power={achieved_power:.2f} < 0.80)")

    # Required sample size per group for 80% power with equal group sizes;
    # `nobs1=None` makes the solver return the group size.
    required_n = tt_ind_solve_power(
        effect_size=observed_d,
        nobs1=None,
        alpha=0.05,
        power=0.80,
        alternative="two-sided",
    )
    print(f"  Required N per group for 80% power: {int(np.ceil(required_n))}")
else:
    print("\n  ✓ Study was adequately powered")

## 9. Hypothesis Testing Summary

In [None]:
# Display H1a criteria results. The pass/fail flags are read from
# `h1a_criteria`; the "observed" values are the ones computed in the earlier
# cells of this notebook.
print("H1a Hypothesis Testing Summary:")
print("\nSuccess Criteria:")
print(
    f"  1. Emergence factor >1.1:  {h1a_criteria['emergence_factor']} (observed: {emergence:.3f})"
)
print(
    f"  2. Significance p<0.05:    {h1a_criteria['significance']} (observed: {p_value:.4f})"
)
print(
    f"  3. Effect size d≥0.5:      {h1a_criteria['effect_size']} (observed: {cohens_d:.3f})"
)
print(f"\nOverall H1a Status: {h1a_criteria['overall_success']}")

if not h1a_criteria["overall_success"]:
    print("\n❌ H1a NOT SUPPORTED")
    print("\nInterpretation:")
    # Each statement is printed only when the observed values back it, so the
    # narrative cannot contradict the numbers if the input data changes.
    if emergence < 1.0:
        print("  - Collaborative mode UNDERPERFORMED rule-based baseline")
    # "No evidence of benefit" holds when the test is non-significant or the
    # observed effect does not favour the collaborative mode.
    if p_value >= 0.05 or cohens_d <= 0:
        print("  - No statistical evidence of benefit from feedback-guided mutations")
    # Only call the underperformance meaningful from a medium effect upward
    if cohens_d <= -0.5:
        magnitude = "Medium" if cohens_d > -0.8 else "Large"
        print(
            f"  - {magnitude} negative effect size suggests meaningful underperformance"
        )
    print(
        "  - Root cause: Phase 1 MVP lacks feedback integration (planned for Phase 2)"
    )
else:
    print("\n✓ H1a SUPPORTED")

## 10. Additional Exploration

In [None]:
# Run-to-run variability as the coefficient of variation (sample SD / mean),
# computed once per mode and reused for both the printout and the checks.
cv_collaborative = collaborative.std(ddof=1) / collaborative.mean()
cv_search_only = search_only.std(ddof=1) / search_only.mean()
cv_rulebased = rulebased.std(ddof=1) / rulebased.mean()

print("Run-to-run Variability:")
print(f"  Collaborative CV: {cv_collaborative * 100:.1f}%")
print(f"  Search-only CV:   {cv_search_only * 100:.1f}%")
print(f"  Rule-based CV:    {cv_rulebased * 100:.1f}%")

print("\nInterpretation:")
# The three checks are independent; any subset of them may print
if cv_search_only > 0.10:
    print("  Search-only shows HIGH variability (CV>10%) - pure random search")
if cv_rulebased < 0.10:
    print(
        "  Rule-based shows MODERATE variability (CV<10%) - selection pressure reduces variance"
    )
if cv_collaborative > cv_rulebased:
    print(
        "  Collaborative MORE variable than rule-based - suggests lack of convergence"
    )

In [None]:
# Pairwise comparisons between the three modes
def _welch_with_effect_size(sample_a, sample_b):
    """Return ``(t, p, d)``: Welch's t-test plus Cohen's d for two samples.

    d divides the mean difference by the square root of the mean of the two
    sample variances (``ddof=1``).
    """
    t_val, p_val = stats.ttest_ind(sample_a, sample_b, equal_var=False)
    d_val = (sample_a.mean() - sample_b.mean()) / np.sqrt(
        (sample_a.std(ddof=1) ** 2 + sample_b.std(ddof=1) ** 2) / 2
    )
    return t_val, p_val, d_val


t1, p1, d1 = _welch_with_effect_size(collaborative, search_only)
t2, p2, d2 = _welch_with_effect_size(search_only, rulebased)

# Collaborative vs. rule-based reuses the values computed in the
# statistical-tests cell instead of recomputing them.
comparisons = (
    ("Collaborative vs. Search-only", t1, p1, d1),
    ("Collaborative vs. Rule-based", t_stat, p_value, cohens_d),
    ("Search-only vs. Rule-based", t2, p2, d2),
)

print("\nPairwise Comparisons (Welch's t-test):")
for title, t_val, p_val, d_val in comparisons:
    print(f"\n  {title}:")
    print(f"    t={t_val:.3f}, p={p_val:.4f}, d={d_val:.3f}")
    print(f"    Significant: {p_val < 0.05}")

# Bonferroni correction: divide α by the number of comparisons (three)
alpha_bonferroni = 0.05 / 3
print(f"\n  Bonferroni-corrected α: {alpha_bonferroni:.4f}")
for title, _, p_val, _ in comparisons:
    print(f"    {title}: {p_val < alpha_bonferroni}")

## 11. Conclusions

**Summary of Findings**:

1. **Hypothesis H1a NOT SUPPORTED**: Collaborative mode underperformed rule-based baseline by 4.6%

2. **Statistical Evidence**: 
   - No significant difference (p=0.579 > 0.05)
   - Medium negative effect size (d=-0.58)
   - 95% CIs show minimal overlap (~9%)

3. **Root Cause**: Phase 1 MVP validated infrastructure but deferred feedback integration to Phase 2

4. **Practical Implications**:
   - Rule-based evolution remains strongest approach
   - Collaborative mode needs Phase 2 implementation
   - Infrastructure works correctly (agents communicate, no crashes)

5. **Next Steps**:
   - Implement Phase 2 feedback integration
   - Re-run C2 validation with LLM-guided mutations
   - Consider alternative hypotheses (H2: explainability, H3: hybrid approaches)

**Final Assessment**: Negative result is scientifically valuable and guides Phase 2 design.