# 16 — Confidence sequences: anytime-valid inference

This notebook demonstrates **anytime-valid inference** for sequential monitoring:
- **Confidence sequence (CS)**: a time-uniform confidence band that remains valid under optional stopping
- **Anytime p-values**: p-values that remain valid even if you stop based on what you see

Scope (v1):
- Two-group A/B, mean-difference (user-level)
- Normal approximation with empirical variance

Outputs:
- CS band across looks
- Anytime boundary / p-values across looks
- Example of safe stopping

Caveats:
- Assumes independence / weak dependence and CLT-like behavior.
- For heavy tails / drift, see notebook 18.


In [None]:
from statistics import NormalDist

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tecore.sequential.schema import (
    SequentialSpec,
    LookSchedule,
    SequentialConfig,
    SequentialMode,
    EffectDirection,
)
from tecore.sequential.preprocess import build_look_table_mean
from tecore.sequential.confidence_sequences import run_confidence_sequence
from tecore.sequential.simulate import SequentialSimConfig, simulate_ab_stream

# Notebook-wide plotting defaults: 9x5 figures with the axes grid switched on.
plt.rcParams.update({"figure.figsize": (9, 5), "axes.grid": True})


## 1) Single run: effect trajectory, fixed CI vs confidence sequence


In [None]:
# --- Single-run settings ---------------------------------------------------
N = 12000                                                # total stream length passed to the simulator
looks = [1000, 2000, 4000, 6000, 8000, 10000, 12000]     # look schedule handed to LookSchedule
alpha = 0.05                                             # level used by the CS config and the plots below

# Column mapping, look schedule, and confidence-sequence configuration.
spec = SequentialSpec(
    group_col="group",
    control_label="control",
    test_label="test",
    y_col="y",
    timestamp_col="timestamp",
)
schedule = LookSchedule(looks=looks)
cfg_cs = SequentialConfig(
    mode=SequentialMode.CONFIDENCE_SEQUENCE,
    alpha=alpha,
    two_sided=True,
    effect_direction=EffectDirection.TWO_SIDED,
    min_n_per_group=50,
    var_floor=1e-12,
    seed=21,
)

# Simulate a modest effect (effect=0.08 with noise_sd=1.0; heavy tails, drift and ratio mode off).
sim_cfg = SequentialSimConfig(
    n=N,
    effect=0.08,
    noise_sd=1.0,
    heavy_tail=False,
    drift=False,
    seed=21,
    ratio=False,
)
df = simulate_ab_stream(sim_cfg)

# Build the per-look table, then run the confidence-sequence analysis on it.
# `warn` is returned alongside the look table but is not used by any cell shown here.
lt, warn = build_look_table_mean(df, spec, schedule, cfg_cs)
res = run_confidence_sequence(lt, cfg_cs)

display(res.look_table.head())
print("Decision:", res.decision, "stopped=", res.stopped, "stop_look=", res.stop_look)


In [None]:
# Plot 1: effect estimate trajectory with fixed CI and CS band
# Reads `res` and `alpha` from the single-run cell; defines `tab` and `x`,
# which the following plot cells reuse.
tab = res.look_table.copy()
x = tab["look_n"].to_numpy(dtype=int)
est = tab["diff"].to_numpy(dtype=float)
se = tab["se"].to_numpy(dtype=float)

# Fixed-horizon two-sided (1 - alpha) CI (for reference only; NOT anytime-valid).
# The critical value and the legend text are derived from `alpha` rather than
# hard-coded for 95%, so the reference band stays correct if `alpha` is changed.
z_fixed = NormalDist().inv_cdf(1.0 - alpha / 2.0)
fix_lo = est - z_fixed * se
fix_hi = est + z_fixed * se
fixed_label = f"fixed {100 * (1 - alpha):g}% CI (reference)"

# Confidence sequence band (anytime-valid); non-numeric entries are coerced to NaN.
cs_lo = pd.to_numeric(tab.get("cs_low"), errors="coerce").to_numpy(dtype=float)
cs_hi = pd.to_numeric(tab.get("cs_high"), errors="coerce").to_numpy(dtype=float)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x, est, marker="o", label="estimate (diff)")
ax.fill_between(x, fix_lo, fix_hi, alpha=0.2, label=fixed_label)
ax.fill_between(x, cs_lo, cs_hi, alpha=0.2, label="confidence sequence (anytime-valid)")
ax.axhline(0.0, linewidth=1.0)  # zero-effect reference line
ax.set_title("Effect trajectory: fixed CI vs confidence sequence")
ax.set_xlabel("look_n")
ax.set_ylabel("difference in means")
ax.legend(loc="best")
plt.show()


In [None]:
# Plot 2: z-trajectory and anytime boundary
# Uses `tab` and `x` from the Plot 1 cell.
z = tab["z"].to_numpy(dtype=float)
b = pd.to_numeric(tab.get("boundary_z"), errors="coerce").to_numpy(dtype=float)

fig, ax = plt.subplots()
ax.plot(x, z, marker="o", label="z")
# Upper and lower boundary; only the upper line gets a legend entry.
ax.plot(x, b, linestyle="--", label="anytime boundary")
ax.plot(x, -b, linestyle="--")
ax.axhline(0.0, linewidth=1.0)
ax.set(
    title="z trajectory vs anytime-valid boundary",
    xlabel="look_n",
    ylabel="z",
)
ax.legend(loc="best")
plt.show()


In [None]:
# Plot 3: anytime p-values across looks
# Uses `tab`, `x` and `alpha` from the earlier cells.
p_any = pd.to_numeric(tab.get("p_anytime"), errors="coerce").to_numpy(dtype=float)

fig, ax = plt.subplots()
ax.plot(x, p_any, marker="o", label="anytime p-value")
ax.axhline(alpha, linestyle="--", label="alpha")  # rejection threshold
ax.set(
    ylim=(0, 1),
    title="Anytime-valid p-values across looks",
    xlabel="look_n",
    ylabel="p",
)
ax.legend(loc="best")
plt.show()


## 2) Simulation: optional stopping without breaking alpha

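We estimate the empirical type I error under the null (true effect = 0) when a user stops as soon as the anytime rule rejects. With `n_sims = 250` replications the estimate is noisy: its Monte Carlo standard error is at most about 0.014 when the true rate is at or below α = 0.05 (√(0.05·0.95/250)), so small deviations from α should not be over-interpreted.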


In [None]:
# --- Simulation settings (these overwrite N / looks / alpha from section 1) ---
n_sims = 250
N = 8000
looks = [800, 1600, 2400, 3200, 4000, 6000, 8000]
alpha = 0.05

spec = SequentialSpec(
    group_col="group",
    control_label="control",
    test_label="test",
    y_col="y",
    timestamp_col="timestamp",
)
schedule = LookSchedule(looks=looks)
cfg_cs = SequentialConfig(
    mode=SequentialMode.CONFIDENCE_SEQUENCE,
    alpha=alpha,
    two_sided=True,
    effect_direction=EffectDirection.TWO_SIDED,
    min_n_per_group=50,
    var_floor=1e-12,
)

# One entry per replication.
reject, stop_looks, min_p = [], [], []

for sim_idx in range(n_sims):
    # Null stream (effect=0.0) with a distinct seed per replication.
    null_cfg = SequentialSimConfig(
        n=N,
        effect=0.0,
        noise_sd=1.0,
        heavy_tail=False,
        drift=False,
        seed=3000 + sim_idx,
        ratio=False,
    )
    df0 = simulate_ab_stream(null_cfg)
    lt, _ = build_look_table_mean(df0, spec, schedule, cfg_cs)
    res0 = run_confidence_sequence(lt, cfg_cs)

    reject.append(res0.decision == "reject")

    # Runs without a recorded stop look are counted at the final scheduled look.
    # NOTE(review): the fallback is a sample size (looks[-1]); confirm that
    # res0.stop_look is expressed in the same units before interpreting avg_stop.
    if res0.stop_look is None:
        stop_looks.append(looks[-1])
    else:
        stop_looks.append(res0.stop_look)

    # Smallest anytime p-value seen across the looks of this replication (NaNs ignored).
    p_path = pd.to_numeric(res0.look_table.get("p_anytime"), errors="coerce").to_numpy(dtype=float)
    min_p.append(float(np.nanmin(p_path)))

# Aggregate over replications.
type1 = float(np.mean(reject))
avg_stop = float(np.mean(stop_looks))

print("Empirical type I error (CS anytime-valid):", type1)
print("Average stop look (null):", avg_stop)

fig, ax = plt.subplots()
ax.hist(min_p, bins=30)
ax.axvline(alpha, linestyle="--", label="alpha")
ax.set(
    title="Distribution of min anytime p-value across looks (null)",
    xlabel="min anytime p-value",
    ylabel="count",
)
ax.legend(loc="best")
plt.show()


## Interpretation

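- A **confidence sequence** provides a time-uniform band: with probability at least (1−α) it contains the true effect at *every* look simultaneously, so it remains valid at level (1−α) at whatever data-dependent time you stop.
- **Anytime p-values** remain super-uniform under the null — i.e. P(p ≤ u) ≤ u for every u in [0, 1] — even with optional stopping.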

Practical takeaway (v1):
- If you need to monitor continuously (or with flexible looks) and still preserve a frequentist guarantee, use CS/anytime-valid outputs.


## Summary (for article / report)

Fill in placeholders after running:

- Schedule: `{looks}` (K = {K}), α = {alpha}
- Under the null (optional stopping allowed):
  - empirical type I error: **{type1:.3f}**
  - average stop look: **{avg_stop:.0f}**

Claim:
- “Confidence sequences provide anytime-valid uncertainty bands and p-values, enabling safe monitoring and stopping.”


In [None]:
# Print the values needed to fill in the placeholders of the Summary section.
# `looks`, `alpha`, `type1` and `avg_stop` come from the simulation cell above.
K = len(looks)
for label, value in (
    ("looks=", looks),
    ("K=", K),
    ("alpha=", alpha),
    ("type1=", type1),
    ("avg_stop=", avg_stop),
):
    print(label, value)
