# Comprehensive Benchmark Analysis Suite

This notebook runs the **complete enhanced benchmarking suite** implementing all improvements from `Benchmarking_Improvement.md`:

## Improvements Implemented

| Category | Enhancement | Description |
|----------|-------------|-------------|
| **Task 1** | N* Interpolation | Power-law fit (SE ∝ N^-0.5) instead of grid search |
| **Task 1** | 95th Percentile | More robust than strict max |
| **Task 3** | K-S Tests | Statistical test for distribution differences |
| **Task 4** | Per-Observable Crossover | Which observables each protocol wins |
| **Task 4** | Soft Dominance | "Wins on 90%+ of observables" criterion |
| **Task 5** | Multiple Pilot Fractions | Test 2%, 5%, 10%, 20% |
| **New** | Observable Property Analysis | Correlation with locality, commutation |
| **New** | Bootstrap Hypothesis Testing | Statistical significance for all claims |
| **New** | Cost-Normalized Metrics | Penalize circuit depth overhead |
| **New** | Sample Complexity Curves | Visual intuition for scaling |

## Output
- Detailed per-task analysis with enhancements
- Statistical significance testing
- Per-observable breakdown
- Publication-ready summary report

In [1]:
# --- Setup ---
import json
import sys
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, HTML

# Put the relative '../src' directory first on the module search path,
# presumably so the quartumse imports below resolve to the in-repo sources
# (the path is relative to the kernel's working directory — confirm).
sys.path.insert(0, '../src')

from qiskit import QuantumCircuit

# Core benchmarking
from quartumse.benchmarking import run_publication_benchmark
from quartumse.observables import Observable, ObservableSet, generate_observable_set
from quartumse.protocols import DirectNaiveProtocol, DirectGroupedProtocol, DirectOptimizedProtocol
from quartumse.protocols.shadows import ShadowsV0Protocol

# Enhanced analysis
from quartumse.analysis import (
    run_comprehensive_analysis,
    interpolate_n_star,
    per_observable_crossover,
    analyze_by_locality,
    compare_protocols_statistically,
    bootstrap_ci,
    compute_cost_normalized_metrics,
    CostModel,
    multi_pilot_analysis,
)

# Reaching this line means every import above succeeded.
print("Setup complete!")

Setup complete!


## 1. Configuration

In [2]:
# --- Configuration ---
# Reproducibility seed and problem size.
SEED = 42
N_QUBITS = 4
N_OBSERVABLES = 30

# Sampling plan: shot budgets to sweep and replicates per budget.
N_SHOTS_GRID = [100, 500, 1000, 5000]
N_REPLICATES = 20

# Passed as `epsilon` / `delta` to the benchmark and analysis calls.
EPSILON = 0.01
DELTA = 0.05

# Protocol IDs for comparison
SHADOWS_PROTOCOL = "classical_shadows_v0"
BASELINE_PROTOCOL = "direct_grouped"

# Output directory is created eagerly so later cells can write into it.
OUTPUT_DIR = Path('results/comprehensive_benchmark')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the settings so the rendered notebook records what was run.
print("Configuration:")
for label, value in (
    ("Qubits", N_QUBITS),
    ("Observables", N_OBSERVABLES),
    ("Shot grid", N_SHOTS_GRID),
    ("Replicates", N_REPLICATES),
    ("Target epsilon", EPSILON),
    ("Output", OUTPUT_DIR),
):
    print(f"  {label}: {value}")

Configuration:
  Qubits: 4
  Observables: 30
  Shot grid: [100, 500, 1000, 5000]
  Replicates: 20
  Target epsilon: 0.01
  Output: results\comprehensive_benchmark


## 2. Circuit and Observables

In [3]:
# --- GHZ Circuit ---
def build_ghz(n_qubits: int) -> QuantumCircuit:
    """Return the GHZ preparation circuit on ``n_qubits`` qubits.

    The circuit applies a Hadamard to qubit 0 and then a chain of CNOT gates,
    each controlled on qubit ``i - 1`` and targeting qubit ``i`` for
    ``i = 1 .. n_qubits - 1``.
    """
    ghz = QuantumCircuit(n_qubits)
    ghz.h(0)
    # Pair every qubit with its successor: (0, 1), (1, 2), ...
    for control, target in zip(range(n_qubits - 1), range(1, n_qubits)):
        ghz.cx(control, target)
    return ghz

# Build the benchmark circuit once for the configured qubit count and print
# its text diagram as a quick visual check.
circuit = build_ghz(N_QUBITS)
print(circuit.draw('text'))

     ┌───┐               
q_0: ┤ H ├──■────────────
     └───┘┌─┴─┐          
q_1: ─────┤ X ├──■───────
          └───┘┌─┴─┐     
q_2: ──────────┤ X ├──■──
               └───┘┌─┴─┐
q_3: ───────────────┤ X ├
                    └───┘


In [4]:
# --- Generate observables with mixed localities ---
def generate_mixed_locality_observables(n_qubits, n_per_locality, seed):
    """Collect random Pauli observables for every weight from 1 to ``n_qubits``.

    For each weight ``k`` a separate ``generate_observable_set`` call is made
    with ``generator_id='random_pauli'``, a fixed weight of ``k``, a request
    for ``n_per_locality`` observables and the seed ``seed + k``.

    Returns:
        A flat list containing the observables of all weights, in order of
        increasing weight.
    """
    return [
        observable
        for weight in range(1, n_qubits + 1)
        for observable in generate_observable_set(
            generator_id='random_pauli',
            n_qubits=n_qubits,
            n_observables=n_per_locality,
            seed=seed + weight,
            weight_distribution='fixed',
            fixed_weight=weight,
        ).observables
    ]

# Generate observables: the same number of random Paulis for each weight
# 1..N_QUBITS.
# NOTE: because of the integer division the random part contains
# N_QUBITS * (N_OBSERVABLES // N_QUBITS) observables, which need not equal
# N_OBSERVABLES. With the default settings that is 4 * 7 = 28, and the two
# stabilizers appended below happen to bring the total to 30.
n_per_locality = N_OBSERVABLES // N_QUBITS
observables = generate_mixed_locality_observables(N_QUBITS, n_per_locality, SEED)

# Add GHZ stabilizers (the all-Z and all-X Pauli strings)
ghz_stabilizers = [
    Observable('Z' * N_QUBITS),
    Observable('X' * N_QUBITS),
]
observables.extend(ghz_stabilizers)

obs_set = ObservableSet(
    observables=observables,
    observable_set_id='mixed_locality_set',
    generator_id='mixed_locality',
    generator_seed=SEED,
)

# Build locality map: observable_id -> number of non-identity Pauli factors
locality_map = {
    obs.observable_id: sum(1 for c in obs.pauli_string if c != 'I')
    for obs in observables
}

print(f"Generated {len(obs_set)} observables")
print("\nLocality distribution:")
# Counter is imported in the setup cell with the other stdlib imports.
loc_counts = Counter(locality_map.values())
for k in sorted(loc_counts):
    print(f"  K={k}: {loc_counts[k]} observables")

Generated 30 observables

Locality distribution:
  K=1: 7 observables
  K=2: 7 observables
  K=3: 7 observables
  K=4: 9 observables


## 3. Run Base Benchmark

In [5]:
# --- Protocols ---
# One default-constructed instance of every protocol under comparison.
protocols = [
    protocol_cls()
    for protocol_cls in (
        DirectNaiveProtocol,
        DirectGroupedProtocol,
        DirectOptimizedProtocol,
        ShadowsV0Protocol,
    )
]

# List the IDs so they can be matched against SHADOWS_PROTOCOL / BASELINE_PROTOCOL.
print("Protocols:")
for protocol in protocols:
    print(f"  - {protocol.protocol_id}")

Protocols:
  - direct_naive
  - direct_grouped
  - direct_optimized
  - classical_shadows_v0


In [None]:
%%time
# --- Run publication benchmark ---
workload_msg = (
    f"This will execute {len(protocols)} protocols × {len(N_SHOTS_GRID)} shot budgets "
    f"× {N_REPLICATES} replicates"
)
print("Running publication benchmark...", workload_msg, "", sep="\n")

# All benchmark inputs come from the configuration, circuit and protocol cells.
benchmark_kwargs = dict(
    circuit=circuit,
    observable_set=obs_set,
    protocols=protocols,
    n_shots_grid=N_SHOTS_GRID,
    n_replicates=N_REPLICATES,
    seed=SEED,
    output_dir=str(OUTPUT_DIR),
    epsilon=EPSILON,
    delta=DELTA,
)
results = run_publication_benchmark(**benchmark_kwargs)

run_summary = results['summary']
print("\nBenchmark completed!")
print(f"Run ID: {run_summary['run_id']}")
print(f"Long-form rows: {run_summary['n_long_form_rows']}")

Running publication benchmark...
This will execute 4 protocols × 4 shot budgets × 20 replicates



---

# Enhanced Analysis Suite

Now we run all the enhanced analyses from `Benchmarking_Improvement.md`.

In [None]:
%%time
# --- Run comprehensive analysis ---
print("Running comprehensive enhanced analysis...")

long_form_rows = results['long_form_results']

# Ground truth is optional: pass None when the benchmark produced none.
ground_truth = results['ground_truth']
if ground_truth:
    truth_values = ground_truth.truth_values
else:
    truth_values = None

analysis = run_comprehensive_analysis(
    long_form_results=long_form_rows,
    truth_values=truth_values,
    epsilon=EPSILON,
    delta=DELTA,
    locality_map=locality_map,
    run_id=results['summary']['run_id'],
    shadows_protocol_id=SHADOWS_PROTOCOL,
    baseline_protocol_id=BASELINE_PROTOCOL,
)

print("Comprehensive analysis complete!")

---

## 4. Executive Summary

In [None]:
# --- Executive Summary ---
summary = analysis.summary

banner = "=" * 70
print(banner)
print("EXECUTIVE SUMMARY")
print(banner)
print()

# Floats are shown with four decimals; every other value uses its default formatting.
for key, value in summary.items():
    rendered = f"{value:.4f}" if isinstance(value, float) else f"{value}"
    print(f"  {key}: {rendered}")

print()
print(banner)

---

## 5. Task 1: Worst-Case Analysis (Enhanced)

**Enhancements:**
- N* via power-law interpolation (not grid search)
- 95th percentile as alternative to strict max

In [None]:
# --- Task 1: Worst-Case ---
# `.get` returns None when the task analysis is missing, so guard before
# dereferencing it (the Task 3 and Task 4 cells guard the same way).
task1 = analysis.task_analyses.get('task1_worst_case')

print("TASK 1: WORST-CASE GUARANTEE")
print("-" * 50)
print()

if task1:
    print("Grid-based N* (original):")
    for protocol, data in task1.base_results.items():
        print(f"  {protocol}: N* = {data.get('n_star', 'N/A')}")

    print()
    print("95th Percentile N* (enhanced):")
    for protocol, data in task1.enhanced_results.get('percentile_95_n_star', {}).items():
        print(f"  {protocol}: N* (95th pct) = {data.get('n_star_95th', 'N/A')}")
else:
    print("Task 1 analysis not available.")

# The interpolated values are stored on the analysis object itself, so they
# are reported whether or not the task-level entry exists.
print()
print("Interpolated N* (power-law):")
for protocol, data in analysis.interpolated_n_star.items():
    n_star = data.get('n_star_interpolated')
    r_sq = data.get('r_squared', 0)
    if n_star:
        print(f"  {protocol}: N* = {n_star:.0f} (R² = {r_sq:.3f})")
    else:
        print(f"  {protocol}: N* not reached in grid")

---

## 6. Task 3: Fixed Budget Distribution (Enhanced)

**Enhancements:**
- K-S test for distribution differences
- Bootstrap CIs on comparisons

In [None]:
# --- Task 3: Fixed Budget Distribution ---
print("TASK 3: FIXED BUDGET DISTRIBUTION")
print("-" * 50)
print()

# Get distributions at max N. Use the true maximum rather than the last grid
# entry so the lookup and the heading stay correct if N_SHOTS_GRID is ever
# listed in a non-ascending order.
max_n = max(N_SHOTS_GRID)
task3 = analysis.task_analyses.get('task3_distribution')

if task3:
    distributions = task3.base_results.get('distributions', {})

    print(f"SE Distribution at N = {max_n}:")
    print()
    print(f"{'Protocol':<25} {'Mean SE':>10} {'Median SE':>10} {'Max SE':>10} {'Std SE':>10}")
    print("-" * 70)

    for protocol, by_n in distributions.items():
        # The shot count may be keyed as an int or as its string form, so try both.
        if str(max_n) in by_n or max_n in by_n:
            stats = by_n.get(str(max_n)) or by_n.get(max_n, {})
            print(f"{protocol:<25} {stats.get('mean', 0):>10.4f} {stats.get('median', 0):>10.4f} "
                  f"{stats.get('max', 0):>10.4f} {stats.get('std', 0):>10.4f}")

In [None]:
# --- Statistical Significance ---
print("\nSTATISTICAL SIGNIFICANCE TESTS", "-" * 50, "", sep="\n")

print(f"Comparison: {SHADOWS_PROTOCOL} vs {BASELINE_PROTOCOL}")
print()

column_titles = (
    f"{'N':>6} {'Diff P-value':>12} {'K-S P-value':>12} "
    f"{'Reject H0':>10} {'SSF':>8} {'SSF 95% CI':>20}"
)
print(column_titles)
print("-" * 75)

# One row per shot budget, in ascending order of N.
for n, comp in sorted(analysis.statistical_comparison.items()):
    ssf = comp.ssf_ci
    # Show the SSF estimate and interval only when an interval exists and its
    # lower bound is a real number.
    if ssf and not np.isnan(ssf.ci_low):
        ssf_str = f"{ssf.estimate:.2f}"
        ssf_ci = f"[{ssf.ci_low:.2f}, {ssf.ci_high:.2f}]"
    else:
        ssf_str = ssf_ci = "N/A"

    reject = "Yes" if comp.difference_test.reject_null else "No"
    cells = (
        f"{n:>6}",
        f"{comp.difference_test.p_value:>12.4f}",
        f"{comp.ks_test.p_value:>12.4f}",
        f"{reject:>10}",
        f"{ssf_str:>8}",
        f"{ssf_ci:>20}",
    )
    print(" ".join(cells))

---

## 7. Task 4: Dominance Analysis (Enhanced)

**Enhancements:**
- Per-observable crossover points
- Soft dominance (90% threshold)
- Performance by locality

In [None]:
# --- Task 4: Dominance ---
print("TASK 4: DOMINANCE & CROSSOVER ANALYSIS", "-" * 50, "", sep="\n")

task4 = analysis.task_analyses.get('task4_dominance')
if task4:
    dom = task4.base_results
    # (printed label, result key, fallback shown when the key is absent)
    dominance_rows = (
        ("Shadows wins at N", 'a_dominates_at', []),
        ("Grouped wins at N", 'b_dominates_at', []),
        ("Crossover N", 'crossover_n', 'None'),
        ("Always shadows better", 'always_a_better', False),
        ("Always grouped better", 'always_b_better', False),
    )
    for label, result_key, fallback in dominance_rows:
        print(f"{label}: {dom.get(result_key, fallback)}")

In [None]:
# --- Per-Observable Crossover ---
print("\nPER-OBSERVABLE CROSSOVER ANALYSIS")
print("-" * 50)
print()

crossover = analysis.crossover_analysis
if crossover:
    # Use a dedicated name: `summary` already holds `analysis.summary` from the
    # executive-summary cell and must not be silently rebound to another object.
    crossover_summary = crossover.summary
    print(f"Total observables: {crossover_summary['n_observables']}")
    print(f"Shadows always wins: {crossover_summary['a_always_wins']} "
          f"({crossover_summary['a_win_fraction']*100:.1f}%)")
    print(f"Grouped always wins: {crossover_summary['b_always_wins']} "
          f"({crossover_summary['b_win_fraction']*100:.1f}%)")
    print(f"Has crossover: {crossover_summary['has_crossover']} "
          f"({crossover_summary['crossover_fraction']*100:.1f}%)")

    # Soft dominance: evaluated at a 90% and an 80% threshold.
    print()
    soft_90 = crossover.soft_dominance(threshold=0.90)
    soft_80 = crossover.soft_dominance(threshold=0.80)
    print(f"Soft dominance (90%): Shadows={soft_90['a_soft_dominates']}, Grouped={soft_90['b_soft_dominates']}")
    print(f"Soft dominance (80%): Shadows={soft_80['a_soft_dominates']}, Grouped={soft_80['b_soft_dominates']}")

    # By locality, in ascending K so the table reads the same way as the
    # locality summary in the conclusions cell.
    print()
    print("Win fraction by locality:")
    by_loc = crossover.by_locality()
    print(f"{'K':>4} {'N Obs':>8} {'Shadows':>10} {'Grouped':>10}")
    print("-" * 35)
    for k, stats in sorted(by_loc.items()):
        print(f"{k:>4} {stats['n_observables']:>8} {stats['a_win_fraction']*100:>9.1f}% {stats['b_win_fraction']*100:>9.1f}%")

---

## 8. Task 5: Pilot Selection (Enhanced)

**Enhancements:**
- Multiple pilot fractions (2%, 5%, 10%, 20%)
- Regret analysis

In [None]:
# --- Task 5: Pilot Selection ---
print("TASK 5: MULTI-PILOT FRACTION ANALYSIS", "-" * 50, "", sep="\n")

pilot = analysis.pilot_analysis
if pilot:
    print(f"Target N: {pilot.target_n}")
    print()
    print(f"{'Pilot %':>10} {'Pilot N':>10} {'Accuracy':>10} {'Mean Regret':>12} {'Max Regret':>12}")
    print("-" * 60)

    # One row per pilot fraction, smallest fraction first.
    for frac, result in sorted(pilot.results.items()):
        row_cells = (
            f"{frac*100:>9.0f}%",
            f"{result.pilot_n:>10}",
            f"{result.selection_accuracy*100:>9.1f}%",
            f"{result.mean_regret:>12.4f}",
            f"{result.max_regret:>12.4f}",
        )
        print(" ".join(row_cells))

    print()
    # A falsy optimal fraction is reported as undetermined.
    if pilot.optimal_fraction:
        print(f"Optimal pilot fraction: {pilot.optimal_fraction*100:.0f}%")
    else:
        print("Optimal not determined")

---

## 9. Observable Property Analysis

**New analysis:** Correlation between observable locality and protocol performance.

In [None]:
# --- Locality Analysis ---
print("OBSERVABLE PROPERTY ANALYSIS (LOCALITY)")
print("-" * 50)
print()

for protocol_id, loc_data in analysis.locality_analysis.items():
    print(f"Protocol: {protocol_id}")
    print(f"  Locality-SE Correlation: {loc_data.get('locality_correlation', 0):.3f}")

    reg = loc_data.get('locality_regression', {})
    if reg:
        print(f"  Regression: SE = {reg.get('intercept', 0):.4f} + {reg.get('slope', 0):.4f} × K")
        print(f"  R² = {reg.get('r_squared', 0):.3f}")

    print(f"  Performance by locality:")
    by_loc = loc_data.get('by_locality', {})
    # Locality keys may be ints or digit strings; going through str() keeps the
    # sort key valid for both (calling .isdigit() on an int key would raise
    # AttributeError). Non-numeric keys sort first with key 0.
    for k, stats in sorted(by_loc.items(), key=lambda x: int(x[0]) if str(x[0]).isdigit() else 0):
        print(f"    K={k}: mean_se={stats.get('mean_se', 0):.4f}, "
              f"theoretical_var_factor={stats.get('theoretical_variance_factor', 0)}")
    print()

---

## 10. Cost-Normalized Analysis

**New analysis:** Account for circuit depth overhead.

In [None]:
# --- Cost-Normalized Analysis ---
print("COST-NORMALIZED COMPARISON", "-" * 50, "", sep="\n")

cost = analysis.cost_analysis
print(f"Cost Model: {cost.get('cost_model', {})}")
print()

comparison = cost.get('comparison_at_max_n', {})
# The table is printed only when per-protocol entries are present.
if 'protocols' in comparison:
    print(f"Comparison at N = {comparison.get('n_total')}:")
    print()
    print(f"{'Protocol':<25} {'Raw SE':>10} {'Cost':>10} {'Cost-Norm SE':>12}")
    print("-" * 60)

    for entry in comparison['protocols']:
        row_cells = (
            f"{entry['protocol_id']:<25}",
            f"{entry['raw_se']:>10.4f}",
            f"{entry['cost']:>10.0f}",
            f"{entry['cost_normalized_se']:>12.4f}",
        )
        print(" ".join(row_cells))

    print()
    # (printed label, key in the comparison mapping)
    for label, result_key in (
        ("Winner (raw SE)", 'winner_raw_se'),
        ("Winner (cost-normalized)", 'winner_cost_normalized'),
        ("Ranking changed", 'ranking_changed'),
    ):
        print(f"{label}: {comparison.get(result_key)}")

---

## 11. Visualizations

In [None]:
# --- Sample Complexity Curves ---
# Group results: protocol_id -> N_total -> list of per-row standard errors.
# (defaultdict is imported in the setup cell with the other stdlib imports.)
by_protocol_n = defaultdict(lambda: defaultdict(list))
for row in long_form_rows:
    by_protocol_n[row.protocol_id][row.N_total].append(row.se)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: SE vs N
ax = axes[0]
colors = {'direct_naive': 'gray', 'direct_grouped': 'blue',
          'direct_optimized': 'green', 'classical_shadows_v0': 'red'}

for protocol_id in [BASELINE_PROTOCOL, SHADOWS_PROTOCOL, 'direct_optimized', 'direct_naive']:
    # Skip protocols that produced no rows instead of plotting an empty series.
    if protocol_id not in by_protocol_n:
        continue
    ns = sorted(by_protocol_n[protocol_id].keys())
    mean_ses = [np.mean(by_protocol_n[protocol_id][n]) for n in ns]
    std_ses = [np.std(by_protocol_n[protocol_id][n]) for n in ns]

    ax.errorbar(ns, mean_ses, yerr=std_ses, marker='o', label=protocol_id,
                color=colors.get(protocol_id, 'black'), capsize=3)

ax.axhline(y=EPSILON, color='black', linestyle='--', label=f'Target ε={EPSILON}')
ax.set_xlabel('Number of Shots (N)', fontsize=12)
ax.set_ylabel('Mean Standard Error', fontsize=12)
ax.set_title('Sample Complexity Curves', fontsize=14)
ax.set_xscale('log')
ax.set_yscale('log')
ax.legend()
ax.grid(True, alpha=0.3)

# Plot 2: SE by locality
ax = axes[1]

for protocol_id in [BASELINE_PROTOCOL, SHADOWS_PROTOCOL]:
    loc_data = analysis.locality_analysis.get(protocol_id, {})
    by_loc = loc_data.get('by_locality', {})

    # Only keys that convert to an integer can be placed on the K axis; any
    # other bucket is skipped rather than aborting the whole figure.
    points = []
    for k, stats in by_loc.items():
        try:
            points.append((int(k), stats.get('mean_se', 0)))
        except (TypeError, ValueError):
            continue
    points.sort(key=lambda point: point[0])

    ks = [point[0] for point in points]
    ses = [point[1] for point in points]

    ax.plot(ks, ses, 'o-', label=protocol_id, color=colors.get(protocol_id, 'black'))

ax.set_xlabel('Locality (Pauli Weight K)', fontsize=12)
ax.set_ylabel('Mean Standard Error', fontsize=12)
ax.set_title(f'Performance by Observable Locality (N={N_SHOTS_GRID[-1]})', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'comprehensive_plots.png', dpi=150)
plt.show()

print(f"\nPlot saved to: {OUTPUT_DIR / 'comprehensive_plots.png'}")

In [None]:
# --- Pilot Selection Plot ---
# Drawn only when a pilot analysis with at least one result exists.
if analysis.pilot_analysis and analysis.pilot_analysis.results:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    accuracy_ax, regret_ax = axes[0], axes[1]

    # Series ordered by pilot fraction; fractions and accuracies as percentages.
    ordered_results = sorted(analysis.pilot_analysis.results.items())
    fracs = [frac * 100 for frac, result in ordered_results]
    accs = [result.selection_accuracy * 100 for frac, result in ordered_results]
    regrets = [result.mean_regret for frac, result in ordered_results]

    # Accuracy plot
    accuracy_ax.plot(fracs, accs, 'bo-', markersize=10)
    # NOTE(review): the 25% reference line presumably assumes a uniform pick
    # among four candidate protocols — confirm if the protocol list changes.
    accuracy_ax.axhline(y=25, color='gray', linestyle='--', label='Random (25%)')
    accuracy_ax.set_xlabel('Pilot Fraction (%)', fontsize=12)
    accuracy_ax.set_ylabel('Selection Accuracy (%)', fontsize=12)
    accuracy_ax.set_title('Pilot Selection Accuracy', fontsize=14)
    accuracy_ax.legend()
    accuracy_ax.grid(True, alpha=0.3)

    # Regret plot
    regret_ax.plot(fracs, regrets, 'ro-', markersize=10)
    regret_ax.set_xlabel('Pilot Fraction (%)', fontsize=12)
    regret_ax.set_ylabel('Mean Regret', fontsize=12)
    regret_ax.set_title('Selection Regret', fontsize=14)
    regret_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'pilot_analysis.png', dpi=150)
    plt.show()

---

## 12. Generate Final Report

In [None]:
# --- Generate Markdown Report ---
report_content = analysis.generate_report()

# Add timestamp and config as two HTML comments followed by a blank line.
generated_at = datetime.now().isoformat()
header = (
    f"<!-- Generated: {generated_at} -->\n"
    f"<!-- Config: {N_QUBITS} qubits, {len(obs_set)} observables, {N_REPLICATES} replicates -->\n"
    "\n"
)
full_report = header + report_content

# Save report
report_path = OUTPUT_DIR / 'comprehensive_report.md'
report_path.write_text(full_report, encoding='utf-8')
print(f"Report saved to: {report_path}")

# Save JSON
json_path = OUTPUT_DIR / 'comprehensive_analysis.json'
analysis.save(json_path)
print(f"JSON saved to: {json_path}")

In [None]:
# --- Display Report ---
# Render the generated report text inline as rich Markdown.
display(Markdown(report_content))

---

## 13. Conclusions

In [None]:
print("="*70)
print("COMPREHENSIVE BENCHMARK CONCLUSIONS")
print("="*70)
print()

# Key findings
s = analysis.summary

# The summary entries refer to the largest shot budget; take the true maximum
# so the label and the lookup below do not depend on the grid being ascending.
final_n = max(N_SHOTS_GRID)

print("1. OVERALL WINNER:")
print(f"   At N={final_n}: {s.get('winner_at_max_n', 'Unknown')}")
print(f"   Shadows/Baseline SE ratio: {s.get('shadows_vs_baseline_ratio', 0):.2f}")
print()

print("2. PER-OBSERVABLE ANALYSIS:")
print(f"   Shadows wins on {s.get('shadows_wins_fraction', 0)*100:.1f}% of observables")
print(f"   Baseline wins on {s.get('baseline_wins_fraction', 0)*100:.1f}% of observables")
print()

print("3. STATISTICAL SIGNIFICANCE:")
if analysis.statistical_comparison:
    max_n_comp = analysis.statistical_comparison.get(final_n)
    if max_n_comp:
        print(f"   Difference p-value: {max_n_comp.difference_test.p_value:.4f}")
        print(f"   Reject null (α=0.05): {max_n_comp.difference_test.reject_null}")
print()

print("4. LOCALITY EFFECT:")
if analysis.crossover_analysis:
    by_loc = analysis.crossover_analysis.by_locality()
    for k, stats in sorted(by_loc.items()):
        a_frac = stats['a_win_fraction']
        b_frac = stats['b_win_fraction']
        # Equal win fractions are a tie and must not be credited to the baseline.
        if a_frac > b_frac:
            print(f"   K={k}: Shadows wins ({a_frac*100:.0f}%)")
        elif b_frac > a_frac:
            print(f"   K={k}: Baseline wins ({b_frac*100:.0f}%)")
        else:
            print(f"   K={k}: Tie ({a_frac*100:.0f}% each)")
print()

print("5. PILOT SELECTION:")
if analysis.pilot_analysis and analysis.pilot_analysis.optimal_fraction:
    opt_frac = analysis.pilot_analysis.optimal_fraction
    opt_result = analysis.pilot_analysis.results.get(opt_frac)
    if opt_result:
        print(f"   Optimal pilot: {opt_frac*100:.0f}% of budget")
        print(f"   Selection accuracy: {opt_result.selection_accuracy*100:.1f}%")
print()

print("="*70)
print(f"Full report: {OUTPUT_DIR / 'comprehensive_report.md'}")
print(f"JSON data: {OUTPUT_DIR / 'comprehensive_analysis.json'}")
print("="*70)