# QuartumSE Benchmark Suite

This notebook benchmarks **classical shadows** against direct measurement baselines for a single circuit.

## The 8 Tasks (Measurements Bible)

| Task | Question |
|------|----------|
| 1 | What N* achieves max SE ≤ ε for ALL observables? |
| 2 | What N* achieves mean SE ≤ ε? |
| 3 | What is the SE distribution at fixed N? |
| 4 | Which protocol wins on more observables? |
| 5 | What fraction of the budget should be used for the pilot? |
| 6 | How does MSE decompose into bias² + variance? |
| 7 | How does performance degrade with noise? |
| 8 | Can budget reallocation improve results? |

In [None]:
# --- Setup ---
import sys
sys.path.insert(0, '../src')

import numpy as np
from collections import Counter, defaultdict
from qiskit import QuantumCircuit

from quartumse import (
    run_benchmark_suite,
    BenchmarkMode,
    BenchmarkSuiteConfig,
    generate_observable_set,
    Observable,
    ObservableSet,
)

print("Setup complete!")

---

## 1. Circuit Configuration

Define the circuit and observables to benchmark.

In [None]:
# =============================================================================
# CIRCUIT CONFIGURATION
# =============================================================================

# Label for this run: passed to run_benchmark_suite as circuit_id and used as
# the prefix of the observable-set id.
CIRCUIT_ID = "ghz_4q"  # Identifier for this benchmark
# Width of the circuit; also the n_qubits handed to generate_observable_set.
# NOTE(review): CIRCUIT_ID embeds "4q" by hand - update both together.
N_QUBITS = 4

# --- Build Circuit ---
def build_ghz(n_qubits: int) -> QuantumCircuit:
    """Prepare the GHZ state (|00...0⟩ + |11...1⟩) / sqrt(2) on ``n_qubits`` qubits.

    A Hadamard on qubit 0 creates the superposition; a chain of CNOTs, each
    qubit controlling its right-hand neighbour, then extends it down the register.
    """
    ghz = QuantumCircuit(n_qubits)
    ghz.h(0)
    # Pair every qubit with its successor: (0, 1), (1, 2), ..., (n-2, n-1).
    for control, target in zip(range(n_qubits - 1), range(1, n_qubits)):
        ghz.cx(control, target)
    return ghz

# Build the benchmark circuit once; the same object is later handed to
# run_benchmark_suite.
circuit = build_ghz(N_QUBITS)
print(f"Circuit: {CIRCUIT_ID}")
# Text rendering so the gate layout is visible directly in the cell output.
print(circuit.draw('text'))

In [None]:
# --- Observables ---
# Five random Pauli strings at every fixed weight 1..N_QUBITS; each weight
# uses its own seed (42 + weight) so the draws are reproducible.
observables = [
    obs
    for weight in range(1, N_QUBITS + 1)
    for obs in generate_observable_set(
        generator_id='random_pauli',
        n_qubits=N_QUBITS,
        n_observables=5,
        seed=42 + weight,
        weight_distribution='fixed',
        fixed_weight=weight,
    ).observables
]

# Append the all-Z and all-X strings. For the 4-qubit GHZ state both are
# stabilizers (Z...Z stabilizes GHZ only when the qubit count is even).
observables += [
    Observable('Z' * N_QUBITS),
    Observable('X' * N_QUBITS),
]

# Wrap the combined list in the single ObservableSet handed to the benchmark.
obs_set = ObservableSet(
    observables=observables,
    observable_set_id=f'{CIRCUIT_ID}_obs',
    generator_id='mixed',
    generator_seed=42,
)

# Locality = number of non-identity factors in the Pauli string, keyed by
# observable id.
locality_map = {
    obs.observable_id: sum(c != 'I' for c in obs.pauli_string)
    for obs in observables
}

print(f"Observables: {len(obs_set)}")
loc_counts = Counter(locality_map.values())
for weight, count in sorted(loc_counts.items()):
    print(f"  K={weight}: {count} observables")

---

## 2. Benchmark Configuration

In [None]:
# =============================================================================
# BENCHMARK CONFIGURATION
# =============================================================================
# One BenchmarkSuiteConfig drives the whole run; the report cells further down
# read epsilon, n_shots_grid and n_replicates back from this object.

config = BenchmarkSuiteConfig(
    mode=BenchmarkMode.ANALYSIS,      # Full analysis with all tasks
    # Shot budgets to sweep; the report cells treat the largest entry as max N.
    # NOTE(review): if SE scales like 1/sqrt(N), epsilon=0.01 needs roughly
    # 10,000 shots for a unit-variance observable, above the largest grid point,
    # so Tasks 1-2 may only report "N* > max" - confirm the grid is intended.
    n_shots_grid=[100, 500, 1000, 5000],
    n_replicates=20,                  # 20 for publication quality
    seed=42,
    epsilon=0.01,                     # Target precision
    delta=0.05,                       # Failure probability
    # Protocol ids of the two contenders being compared.
    shadows_protocol_id="classical_shadows_v0",
    baseline_protocol_id="direct_grouped",
    output_base_dir="benchmark_results",
)

# Echo the key settings so the notebook output records what was run.
print("Benchmark Configuration:")
print(f"  Mode: {config.mode.value}")
print(f"  Shot grid: {config.n_shots_grid}")
print(f"  Replicates: {config.n_replicates}")
print(f"  Target ε: {config.epsilon}")

---

## 3. Run Benchmark

In [None]:
%%time
# =============================================================================
# RUN BENCHMARK
# =============================================================================
# Single entry point for the whole suite. The returned `result` is read by the
# report cells below (long_form_results, ground_truth, summary, analysis,
# output_dir).

result = run_benchmark_suite(
    circuit=circuit,
    observable_set=obs_set,
    circuit_id=CIRCUIT_ID,
    config=config,
    locality_map=locality_map,  # observable_id -> Pauli weight, built in the observables cell
)

---

## 4. Task Summary Report

Clear quantitative answers to each of the 8 benchmark questions.

In [None]:
# =============================================================================
# TASK SUMMARY REPORT
# =============================================================================
# Shared state for the per-task cells that follow: the long-form result rows,
# ground-truth values (if any), the largest shot budget, and the target epsilon.

long_form = result.long_form_results
truth_values = result.ground_truth.truth_values if result.ground_truth else {}

# Use the grid reported by the suite; if it is missing or empty, fall back to
# the configured grid. This keeps max_n and the printed grid in agreement and
# avoids max() raising on an empty list.
shots_grid = result.summary.get('n_shots_grid') or list(config.n_shots_grid)
max_n = max(shots_grid)
epsilon = config.epsilon

# Group data by protocol and N: by_protocol_n[protocol_id][N_total] -> rows
by_protocol_n = defaultdict(lambda: defaultdict(list))
for row in long_form:
    by_protocol_n[row.protocol_id][row.N_total].append(row)

protocols = list(by_protocol_n.keys())

print("=" * 80)
print("BENCHMARK TASK SUMMARY REPORT")
print("=" * 80)
print(f"\nCircuit: {CIRCUIT_ID}")
print(f"Qubits: {N_QUBITS}")
print(f"Observables: {len(obs_set)}")
print(f"Protocols: {', '.join(protocols)}")
print(f"Shot Grid: {shots_grid}")
print(f"Replicates: {config.n_replicates}")
print()

In [None]:
# --- TASK 1: Worst-Case Guarantee ---
# For each protocol, find the smallest shot budget in the grid at which the
# largest SE over all rows (every observable and replicate) is within epsilon.
print("-" * 80)
print("TASK 1: WORST-CASE GUARANTEE")
print("Question: What N* achieves max SE ≤ ε for ALL observables?")
print("-" * 80)
print(f"\nTarget ε = {epsilon}")
print()

for protocol in protocols:
    rows_by_n = by_protocol_n[protocol]
    n_star = None
    for n in sorted(rows_by_n):
        ses = [r.se for r in rows_by_n[n] if r.se is not None]
        # A budget with no usable SE values cannot certify the target
        # (and max() of an empty sequence would raise).
        if ses and max(ses) <= epsilon:
            n_star = n
            break

    if n_star is not None:
        print(f"  {protocol}: N* = {n_star} shots")
        continue

    # Target not reached: report how close the largest budget got. .get()
    # avoids inserting an empty bucket into the defaultdict.
    ses_at_max = [r.se for r in rows_by_n.get(max_n, []) if r.se is not None]
    if ses_at_max:
        print(f"  {protocol}: N* > {max_n} (max SE = {max(ses_at_max):.4f} at N={max_n})")
    else:
        print(f"  {protocol}: N* > {max_n} (no SE data at N={max_n})")

In [None]:
# --- TASK 2: Average Target ---
# For each protocol, find the smallest shot budget in the grid at which the
# mean SE over all rows (every observable and replicate) is within epsilon.
print("-" * 80)
print("TASK 2: AVERAGE TARGET")
print("Question: What N* achieves mean SE ≤ ε?")
print("-" * 80)
print(f"\nTarget ε = {epsilon}")
print()

for protocol in protocols:
    rows_by_n = by_protocol_n[protocol]
    n_star = None
    for n in sorted(rows_by_n):
        ses = [r.se for r in rows_by_n[n] if r.se is not None]
        # Skip budgets with no usable SE values: np.mean([]) would yield nan
        # together with a RuntimeWarning.
        if ses and np.mean(ses) <= epsilon:
            n_star = n
            break

    if n_star is not None:
        print(f"  {protocol}: N* = {n_star} shots")
        continue

    # Target not reached: report how close the largest budget got. .get()
    # avoids inserting an empty bucket into the defaultdict.
    ses_at_max = [r.se for r in rows_by_n.get(max_n, []) if r.se is not None]
    if ses_at_max:
        print(f"  {protocol}: N* > {max_n} (mean SE = {np.mean(ses_at_max):.4f} at N={max_n})")
    else:
        print(f"  {protocol}: N* > {max_n} (no SE data at N={max_n})")

In [None]:
# --- TASK 3: Fixed Budget Distribution ---
# Summary statistics of the SE values each protocol produced at the largest
# shot budget (rows without an SE are ignored).
rule = "-" * 80
print(rule)
print("TASK 3: FIXED BUDGET DISTRIBUTION")
print(f"Question: What is the SE distribution at N = {max_n}?")
print(rule)
print()
print(f"{'Protocol':<25} {'Mean SE':>10} {'Median SE':>10} {'Max SE':>10} {'Std SE':>10}")
print("-" * 70)

for protocol in protocols:
    ses = [r.se for r in by_protocol_n[protocol][max_n] if r.se is not None]
    if not ses:
        continue  # nothing to summarise for this protocol
    stats = (np.mean(ses), np.median(ses), np.max(ses), np.std(ses))
    columns = " ".join(f"{value:>10.4f}" for value in stats)
    print(f"{protocol:<25} {columns}")

In [None]:
# --- TASK 4: Dominance ---
# For every observable, the protocol with the lowest replicate-averaged SE at
# the largest shot budget "wins"; report each protocol's win count.
print("-" * 80)
print("TASK 4: DOMINANCE")
print("Question: Which protocol wins on more observables?")
print("-" * 80)
print()

# Average each observable's SE over its replicates first, so the comparison is
# not decided by the single luckiest replicate. Rows without an SE are skipped
# (comparing None with a float would raise TypeError).
mean_se_by_protocol = {}
for protocol in protocols:
    se_samples = defaultdict(list)
    for row in by_protocol_n[protocol].get(max_n, []):
        if row.se is not None:
            se_samples[row.observable_id].append(row.se)
    mean_se_by_protocol[protocol] = {
        obs_id: float(np.mean(samples)) for obs_id, samples in se_samples.items()
    }

# Find best protocol for each observable at max N
obs_best = {}
for protocol in protocols:
    for obs_id, mean_se in mean_se_by_protocol[protocol].items():
        current = obs_best.get(obs_id)
        if current is None or mean_se < current['se']:
            obs_best[obs_id] = {'se': mean_se, 'protocol': protocol}

# Counter returns 0 for protocols with no wins without inserting them, so
# `wins` stays empty when there is no data at all.
wins = Counter(data['protocol'] for data in obs_best.values())

total_obs = len(obs_best)
print(f"At N = {max_n}:")
for protocol in protocols:
    win_count = wins[protocol]
    win_pct = 100 * win_count / total_obs if total_obs > 0 else 0
    print(f"  {protocol}: wins on {win_count}/{total_obs} observables ({win_pct:.1f}%)")

if wins:
    top_count = max(wins.values())
    leaders = [p for p in protocols if wins[p] == top_count]
    if len(leaders) == 1:
        overall_winner = leaders[0]
        print(f"\n  WINNER: {overall_winner}")
    else:
        # Equal win counts: do not declare an arbitrary winner.
        print(f"\n  TIE: {', '.join(leaders)}")

In [None]:
# --- TASK 5: Pilot Selection ---
# Tabulate the pilot study attached to the analysis results, if one exists:
# one line per pilot fraction, followed by the reported optimal fraction.
rule = "-" * 80
print(rule)
print("TASK 5: PILOT SELECTION")
print("Question: What fraction of budget should be used for pilot?")
print(rule)
print()

analysis = result.analysis
pilot = getattr(analysis, 'pilot_analysis', None) if analysis else None

if not pilot:
    print("  (Requires analysis mode for detailed pilot study)")
else:
    print(f"{'Pilot %':>10} {'Accuracy':>12} {'Mean Regret':>12}")
    print("-" * 40)
    # Walk the pilot fractions in ascending order.
    for frac in sorted(pilot.results):
        res = pilot.results[frac]
        print(f"{frac*100:>9.0f}% {res.selection_accuracy*100:>11.1f}% {res.mean_regret:>12.4f}")
    print(f"\n  OPTIMAL PILOT: {pilot.optimal_fraction*100:.0f}% of budget")

In [None]:
# --- TASK 6: Bias-Variance Decomposition ---
# At the largest shot budget, split each protocol's error into squared bias
# (mean estimate minus ground truth) and variance of the estimates, computed
# per observable and then averaged over observables.
rule = "-" * 80
print(rule)
print("TASK 6: BIAS-VARIANCE DECOMPOSITION")
print("Question: How does MSE decompose into bias² + variance?")
print(rule)
print()

if not truth_values:
    print("  (Requires ground truth for bias-variance decomposition)")
else:
    print(f"{'Protocol':<25} {'Bias²':>12} {'Variance':>12} {'MSE':>12}")
    print("-" * 65)

    for protocol in protocols:
        # Collect the estimates of each observable that has a known true value.
        estimates_by_obs = defaultdict(list)
        for row in by_protocol_n[protocol][max_n]:
            if row.observable_id in truth_values:
                estimates_by_obs[row.observable_id].append(row.estimate)

        if not estimates_by_obs:
            continue  # no observable with ground truth for this protocol

        biases_sq = [
            (np.mean(estimates) - truth_values[obs_id]) ** 2
            for obs_id, estimates in estimates_by_obs.items()
        ]
        variances = [np.var(estimates) for estimates in estimates_by_obs.values()]

        mean_bias_sq = np.mean(biases_sq)
        mean_var = np.mean(variances)
        mse = mean_bias_sq + mean_var
        print(f"{protocol:<25} {mean_bias_sq:>12.6f} {mean_var:>12.6f} {mse:>12.6f}")

In [None]:
# --- TASK 7: Noise Sensitivity / TASK 8: Adaptive Efficiency ---
# Neither task is computed in this notebook; each section prints only its
# banner and a short note on what would be required.
rule = "-" * 80
placeholder_sections = [
    (
        "TASK 7: NOISE SENSITIVITY",
        "Question: How does performance degrade with noise?",
        [
            "  (Requires running benchmark with multiple noise profiles)",
            "  Current run: ideal (noiseless) simulation",
        ],
    ),
    (
        "TASK 8: ADAPTIVE EFFICIENCY",
        "Question: Can budget reallocation improve results?",
        [
            "  (Requires two-stage adaptive protocol implementation)",
            "  See pilot analysis (Task 5) for related insights",
        ],
    ),
]

for index, (title, question, notes) in enumerate(placeholder_sections):
    if index:
        print()  # blank line separating consecutive sections
    print(rule)
    print(title)
    print(question)
    print(rule)
    print()
    for note in notes:
        print(note)

In [None]:
# =============================================================================
# EXECUTIVE SUMMARY
# =============================================================================
# Headline comparison built from the per-protocol summaries returned by the
# suite: best protocol by mean and by max SE, then the shadows-vs-baseline
# SE ratio and the implied shot savings.

print()
print("=" * 80)
print("EXECUTIVE SUMMARY")
print("=" * 80)
print()

protocol_summaries = result.summary.get('protocol_summaries', {})


def _summary_metric(protocol_id, key):
    """Return a protocol's summary metric, ranking missing/None values last."""
    value = protocol_summaries[protocol_id].get(key)
    return float('inf') if value is None else value


if protocol_summaries:
    best_by_mean = min(protocol_summaries, key=lambda p: _summary_metric(p, 'mean_se'))
    best_by_max = min(protocol_summaries, key=lambda p: _summary_metric(p, 'max_se'))

    print(f"Best protocol (mean SE): {best_by_mean}")
    print(f"Best protocol (max SE):  {best_by_max}")
    print()

    # Shadows vs baseline comparison. Take the protocol ids from the config so
    # this stays in step with the benchmark configuration; the literals are
    # only fallbacks. The printed labels assume the default protocol pair.
    shadows_id = getattr(config, 'shadows_protocol_id', 'classical_shadows_v0')
    baseline_id = getattr(config, 'baseline_protocol_id', 'direct_grouped')
    shadows_se = protocol_summaries.get(shadows_id, {}).get('mean_se')
    grouped_se = protocol_summaries.get(baseline_id, {}).get('mean_se')

    # Compare only when both values were actually reported; substituting a
    # made-up default would produce a meaningless ratio.
    if shadows_se is not None and grouped_se is not None:
        if shadows_se > 0 and grouped_se > 0:
            ratio = shadows_se / grouped_se
            print(f"Shadows vs Grouped SE ratio: {ratio:.2f}x")
            if ratio < 1:
                # SE scales as 1/sqrt(N), so an SE ratio r corresponds to a
                # 1/r**2 reduction in shots for the same precision.
                ssf = 1 / (ratio ** 2)  # Shot savings factor
                print(f"  → Classical shadows is {1/ratio:.1f}x more efficient")
                print(f"  → Shot Savings Factor (SSF): {ssf:.1f}x")
            elif ratio > 1:
                print(f"  → Direct grouped is {ratio:.1f}x more efficient")
            else:
                print("  → Both protocols have the same mean SE")
        else:
            # A zero (or negative) mean SE makes the ratio and the shot
            # savings factor undefined; say so rather than dividing by zero.
            print("Shadows vs Grouped SE ratio: undefined (non-positive mean SE reported)")

print()
print("=" * 80)
print(f"Full results saved to: {result.output_dir}")
print("=" * 80)

---

## Summary

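This benchmark addresses the 8 Measurements Bible questions. Tasks 1–6 are computed from this run; Tasks 7 and 8 are reported only as placeholders because they require a noise sweep and an adaptive protocol, respectively: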

| Task | Question | Answer Format |
|------|----------|---------------|
| 1 | Worst-case N*? | N* = X shots (or N* > max_N) |
| 2 | Average N*? | N* = X shots (or N* > max_N) |
| 3 | SE distribution? | mean, median, max, std |
| 4 | Dominance? | Protocol X wins Y% |
| 5 | Pilot fraction? | Optimal = X% |
| 6 | Bias-variance? | Bias² = X, Var = Y, MSE = Z |
| 7 | Noise sensitivity? | (requires noise sweep) |
| 8 | Adaptive efficiency? | (requires adaptive protocol) |

All results are saved to a timestamped directory for reproducibility.