# 15 — Group sequential boundaries: Pocock vs O’Brien–Fleming

This notebook compares two classic group sequential designs:
- **O’Brien–Fleming (OBF)**: very strict early, close to fixed-horizon late
- **Pocock**: more permissive early, more conservative late

We show:
- Boundaries across looks (z-critical)
- One example stream: z-trajectory with both boundaries
- Monte Carlo comparison: stop times and rejection rates


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tecore.sequential.schema import (
    SequentialSpec,
    LookSchedule,
    SequentialConfig,
    SequentialMode,
    SpendingFunction,
    EffectDirection,
)
from tecore.sequential.preprocess import build_look_table_mean
from tecore.sequential.group_sequential import run_group_sequential
from tecore.sequential.simulate import SequentialSimConfig, simulate_ab_stream

# Notebook-wide plotting defaults: 9x5 figures with grid lines switched on.
plt.rcParams.update(
    {
        "figure.figsize": (9, 5),
        "axes.grid": True,
    }
)


## 1) Boundary shapes across looks

We compute per-look z-critical values under both spending functions, using the same schedule.


In [None]:
# --- Design parameters reused by the later cells ---
N = 10000  # value passed as `n` to every simulated stream
looks = [1000, 2000, 3000, 4000, 6000, 8000, 10000]  # look schedule (unevenly spaced)
alpha = 0.05  # significance level given to both designs

# A stream with zero effect is used as input for the look tables below.
# NOTE(review): presumably `boundary_z` depends on the schedule/alpha rather than
# on the simulated data -- confirm against run_group_sequential.
null_sim_cfg = SequentialSimConfig(
    n=N,
    effect=0.0,
    noise_sd=1.0,
    heavy_tail=False,
    drift=False,
    seed=1,
    ratio=False,
)
df0 = simulate_ab_stream(null_sim_cfg)

spec = SequentialSpec(
    group_col="group",
    control_label="control",
    test_label="test",
    y_col="y",
    timestamp_col="timestamp",
)
schedule = LookSchedule(looks=looks)


def _make_gs_config(spending):
    """Build a two-sided group sequential config that differs only in `spending`."""
    return SequentialConfig(
        mode=SequentialMode.GROUP_SEQUENTIAL,
        alpha=alpha,
        two_sided=True,
        spending=spending,
        effect_direction=EffectDirection.TWO_SIDED,
        min_n_per_group=50,
    )


cfg_obf = _make_gs_config(SpendingFunction.OBRIEN_FLEMING)
cfg_poc = _make_gs_config(SpendingFunction.POCOCK)

# Run both designs on the same stream and schedule.
lt0, _ = build_look_table_mean(df0, spec, schedule, cfg_obf)
res_obf = run_group_sequential(lt0, cfg_obf)

lt1, _ = build_look_table_mean(df0, spec, schedule, cfg_poc)
res_poc = run_group_sequential(lt1, cfg_poc)

# The x-axis comes from the OBF table; both runs were given the same schedule.
obf_table = res_obf.look_table
poc_table = res_poc.look_table
x = obf_table["look_n"].to_numpy(dtype=int)
b_obf = obf_table["boundary_z"].to_numpy(dtype=float)
b_poc = poc_table["boundary_z"].to_numpy(dtype=float)

fig, ax = plt.subplots()
for curve, curve_label in ((b_obf, "OBF boundary"), (b_poc, "Pocock boundary")):
    ax.plot(x, curve, marker="o", label=curve_label)
ax.set(
    title="Group sequential boundaries across looks",
    xlabel="look_n",
    ylabel="z-critical",
)
ax.legend(loc="best")
plt.show()

# Same numbers as the plot, shown as a table.
display(pd.DataFrame({"look_n": x, "z_obf": b_obf, "z_pocock": b_poc}))


## 2) One stream: z-trajectory vs both boundaries

We simulate a single stream with a modest true effect (0.08) and overlay its z-statistic at each look on both sets of boundaries.


In [None]:
# One simulated stream with a modest effect (0.08), analysed under both designs.
df = simulate_ab_stream(
    SequentialSimConfig(
        n=N,
        effect=0.08,
        noise_sd=1.0,
        heavy_tail=False,
        drift=False,
        seed=11,
        ratio=False,
    )
)

lt, _ = build_look_table_mean(df, spec, schedule, cfg_obf)
res_obf = run_group_sequential(lt, cfg_obf)

lt, _ = build_look_table_mean(df, spec, schedule, cfg_poc)
res_poc = run_group_sequential(lt, cfg_poc)

x = res_obf.look_table["look_n"].to_numpy(dtype=int)
# NOTE(review): z is read from the OBF result only; presumably the Pocock table
# holds the same z values since both use the same stream and schedule -- confirm.
z = res_obf.look_table["z"].to_numpy(dtype=float)
b_obf = res_obf.look_table["boundary_z"].to_numpy(dtype=float)
b_poc = res_poc.look_table["boundary_z"].to_numpy(dtype=float)

fig, ax = plt.subplots()
ax.plot(x, z, marker="o", label="z trajectory")

# Draw each two-sided boundary as a +/- pair in ONE color. Without an explicit
# color the mirrored lower line takes the next color in the cycle, so the two
# halves of the same boundary look unrelated and the legend matches only one.
for bound, line_style, bound_label in (
    (b_obf, "--", "OBF boundary"),
    (b_poc, ":", "Pocock boundary"),
):
    (upper_line,) = ax.plot(x, bound, linestyle=line_style, label=bound_label)
    ax.plot(x, -bound, linestyle=line_style, color=upper_line.get_color())

ax.axhline(0.0, linewidth=1.0)
ax.set_title("Example stream: z trajectory vs OBF/Pocock")
ax.set_xlabel("look_n")
ax.set_ylabel("z")
ax.legend(loc="best")
plt.show()

print("OBF:", res_obf.decision, "stopped=", res_obf.stopped, "stop_look=", res_obf.stop_look)
print("Pocock:", res_poc.decision, "stopped=", res_poc.stopped, "stop_look=", res_poc.stop_look)


## 3) Monte Carlo: rejection rate and stop time distribution

We compare OBF and Pocock under a fixed effect.

Metrics:
- rejection rate (power proxy)
- average stop look (earliness); runs with no recorded stop look are counted at the final look, `looks[-1]`
- stop time distribution (histogram)


In [None]:
# Monte Carlo comparison of OBF and Pocock on identical simulated streams.
n_sims = 250   # number of simulated streams (seeds 2000 .. 2000 + n_sims - 1)
effect = 0.06  # effect passed to every simulated stream

obf_reject, poc_reject = [], []
obf_stop, poc_stop = [], []

# Each design is paired with the lists that collect its results, so both designs
# run through the same code path on the same stream.
design_runs = (
    (cfg_obf, obf_reject, obf_stop),
    (cfg_poc, poc_reject, poc_stop),
)

for s in range(n_sims):
    df = simulate_ab_stream(
        SequentialSimConfig(
            n=N,
            effect=effect,
            noise_sd=1.0,
            heavy_tail=False,
            drift=False,
            seed=2000 + s,
            ratio=False,
        )
    )
    for design_cfg, reject_flags, stop_looks in design_runs:
        lt, _ = build_look_table_mean(df, spec, schedule, design_cfg)
        res = run_group_sequential(lt, design_cfg)
        reject_flags.append(res.decision == "reject")
        # Runs without a recorded stop look are counted at the final look.
        # NOTE(review): this fallback implies `stop_look` is on the same scale
        # as `looks` -- confirm against run_group_sequential.
        stop_looks.append(res.stop_look if res.stop_look is not None else looks[-1])

obf_rej_rate = float(np.mean(obf_reject))
poc_rej_rate = float(np.mean(poc_reject))
obf_stop_avg = float(np.mean(obf_stop))
poc_stop_avg = float(np.mean(poc_stop))

print("Rejection rate (OBF):", obf_rej_rate)
print("Rejection rate (Pocock):", poc_rej_rate)
print("Avg stop look (OBF):", obf_stop_avg)
print("Avg stop look (Pocock):", poc_stop_avg)

# Stop looks are discrete and unevenly spaced. Two separate hist() calls with
# bins=len(looks) would each derive equal-width bins from their own data range,
# giving misaligned bins that can merge neighbouring looks. Count each distinct
# stop value instead and draw the two designs as side-by-side bars.
stop_values = sorted(set(obf_stop) | set(poc_stop))
obf_counts = pd.Series(obf_stop).value_counts().reindex(stop_values, fill_value=0)
poc_counts = pd.Series(poc_stop).value_counts().reindex(stop_values, fill_value=0)

positions = np.arange(len(stop_values))
bar_width = 0.4

fig, ax = plt.subplots()
ax.bar(positions - bar_width / 2, obf_counts.to_numpy(), width=bar_width, alpha=0.7, label="OBF")
ax.bar(positions + bar_width / 2, poc_counts.to_numpy(), width=bar_width, alpha=0.7, label="Pocock")
ax.set_xticks(positions)
ax.set_xticklabels([str(v) for v in stop_values])
ax.set_title("Stop look distribution")
ax.set_xlabel("stop_look")
ax.set_ylabel("count")
ax.legend(loc="best")
plt.show()


## Interpretation

- OBF typically **requires stronger early evidence**, so it is less likely to stop very early on weak signals; it behaves close to a fixed-horizon test near the final look.
- Pocock tends to be **more permissive early**, which may yield earlier stopping when effects are real, but it pays by being more conservative at later looks.

Practical guidance:
- Prefer **OBF** when you want early stopping to be rare unless evidence is very strong.
- Prefer **Pocock** when you value earlier decisions and accept a more conservative final threshold.


## Summary (for article / report)

Fill in the placeholders below with the values printed by the final code cell after running the notebook:

- Schedule: `{looks}` (K = {K}), α = {alpha}
- Effect used in simulation: {effect}
- Rejection rate:
  - OBF: **{obf_rej_rate:.3f}**
  - Pocock: **{poc_rej_rate:.3f}**
- Average stop look:
  - OBF: **{obf_stop_avg:.0f}**
  - Pocock: **{poc_stop_avg:.0f}**

Claim:
- “OBF is stricter early and closer to fixed-horizon late; Pocock is more permissive early but more conservative late.”


In [None]:
# Print every value the Summary section needs, one "name= value" line each.
K = len(looks)  # number of scheduled looks

summary_lines = (
    ("looks=", looks),
    ("K=", K),
    ("alpha=", alpha),
    ("effect=", effect),
    ("obf_rej_rate=", obf_rej_rate),
    ("poc_rej_rate=", poc_rej_rate),
    ("obf_stop_avg=", obf_stop_avg),
    ("poc_stop_avg=", poc_stop_avg),
)
for line_label, line_value in summary_lines:
    print(line_label, line_value)
