# 17 — Sequential analysis for ratio metrics via linearization

Ratio metrics (e.g., revenue / sessions) are common but statistically tricky:
- The ratio-of-means and mean-of-ratios differ
- Instability can arise from small or zero denominators

In v1 we use a **linearization** approach (delta-method style), then apply the same sequential pipeline as for mean metrics.

We demonstrate:
- Linearization recap
- Sequential monitoring on linearized signal
- Failure mode: denominator zeros / near-zeros

Outputs:
- Look table for the linearized metric
- z-trajectory with sequential boundary (group sequential or confidence sequence, CS)
- Practical guardrails


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tecore.sequential.schema import (
    SequentialSpec,
    LookSchedule,
    SequentialConfig,
    SequentialMode,
    SpendingFunction,
    EffectDirection,
)
from tecore.sequential.preprocess import build_look_table_mean
from tecore.sequential.group_sequential import run_group_sequential
from tecore.sequential.confidence_sequences import run_confidence_sequence
from tecore.sequential.ratio import linearize_ratio
from tecore.sequential.simulate import SequentialSimConfig, simulate_ab_stream

# Notebook-wide plotting defaults: fixed figure size and grid lines on every axes.
plt.rcParams.update(
    {
        "figure.figsize": (9, 5),
        "axes.grid": True,
    }
)


## 1) Simulate a ratio stream and linearize


In [None]:
# Experiment design constants reused by the cells below.
N = 20000  # passed to the simulator as n
looks = [2000, 4000, 6000, 8000, 12000, 16000, 20000]  # look schedule
alpha = 0.05  # error budget handed to both sequential procedures

# ratio=True => generate num/den columns
sim_cfg = SequentialSimConfig(
    n=N,
    effect=0.03,
    noise_sd=1.0,
    heavy_tail=False,
    drift=False,
    seed=41,
    ratio=True,
)
df = simulate_ab_stream(sim_cfg)

print(df.head())

# Guardrail metric: share of rows whose denominator is zero or negative.
nonpositive_den_share = float((df["den"] <= 0).mean())
print("den zeros share:", nonpositive_den_share)

# Linearize ratio; baseline fixed by first look (v1)
linearize_kwargs = dict(
    num_col="num",
    den_col="den",
    group_col="group",
    control_label="control",
    baseline_mode="first_look",
    first_look_n=looks[0],
)
df_lin, baseline = linearize_ratio(df, **linearize_kwargs)

print("baseline_ratio(first_look, control)=", baseline)

# Show the raw inputs next to the constructed linearized signal.
preview_cols = ["group", "num", "den", "y_lin"]
print(df_lin[preview_cols].head())


## 2) Run sequential monitoring on the linearized metric

We show two modes:
- Group sequential with O'Brien–Fleming (OBF) alpha spending
- Confidence sequence (anytime-valid)


In [None]:
# Column mapping for the linearized frame: the metric under test is y_lin.
spec = SequentialSpec(
    group_col="group",
    control_label="control",
    test_label="test",
    y_col="y_lin",
    timestamp_col="timestamp",
)
schedule = LookSchedule(looks=looks)

# Settings common to both procedures; only mode (and spending) differ below.
shared_cfg = dict(
    alpha=alpha,
    two_sided=True,
    effect_direction=EffectDirection.TWO_SIDED,
    min_n_per_group=200,
    var_floor=1e-12,
    seed=41,
)

# Group sequential (OBF)
cfg_gs = SequentialConfig(
    mode=SequentialMode.GROUP_SEQUENTIAL,
    spending=SpendingFunction.OBRIEN_FLEMING,
    **shared_cfg,
)
lt_gs, warn_gs = build_look_table_mean(df_lin, spec, schedule, cfg_gs)
res_gs = run_group_sequential(lt_gs, cfg_gs)

# Confidence sequence
cfg_cs = SequentialConfig(
    mode=SequentialMode.CONFIDENCE_SEQUENCE,
    **shared_cfg,
)
lt_cs, warn_cs = build_look_table_mean(df_lin, spec, schedule, cfg_cs)
res_cs = run_confidence_sequence(lt_cs, cfg_cs)

# One status line per procedure, GS first and then CS.
for label, res in (("GS decision:", res_gs), ("CS decision:", res_cs)):
    print(label, res.decision, "stopped=", res.stopped, "stop_look=", res.stop_look)

display(res_gs.look_table.head())


In [None]:
# Plot: z trajectories with boundaries


def _numeric_column(table, name):
    """Return ``table[name]`` as a float array, or all-NaN if the column is absent.

    Values that cannot be parsed as numbers are coerced to NaN. A missing
    column yields an array of NaN with one entry per row, so the caller can
    still plot the remaining series instead of failing.
    """
    if name not in table.columns:
        return np.full(len(table), np.nan)
    return pd.to_numeric(table[name], errors="coerce").to_numpy(dtype=float)


def _plot_z_vs_boundary(table, boundary_label, title):
    """Draw the z trajectory of one look table against its +/- boundary.

    Parameters:
        table: look table with "look_n" and "z" columns and, optionally,
            a "boundary_z" column.
        boundary_label: legend entry for the boundary curve.
        title: axes title.
    """
    # Use this table's own look_n so the plot stays valid even when the GS
    # and CS look tables do not have the same number of rows.
    look_n = table["look_n"].to_numpy(dtype=int)
    z = table["z"].to_numpy(dtype=float)
    bound = _numeric_column(table, "boundary_z")

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(look_n, z, marker="o", label="z (linearized)")
    (upper,) = ax.plot(look_n, bound, linestyle="--", label=boundary_label)
    # Mirror the boundary in the same colour so both halves read as one curve.
    ax.plot(look_n, -bound, linestyle="--", color=upper.get_color())
    ax.axhline(0.0, linewidth=1.0)
    ax.set_title(title)
    ax.set_xlabel("look_n")
    ax.set_ylabel("z")
    ax.legend(loc="best")
    plt.show()


tab_gs = res_gs.look_table
tab_cs = res_cs.look_table

_plot_z_vs_boundary(
    tab_gs,
    boundary_label="GS boundary (OBF)",
    title="Ratio via linearization: z trajectory vs GS boundary",
)
_plot_z_vs_boundary(
    tab_cs,
    boundary_label="CS anytime boundary",
    title="Ratio via linearization: z trajectory vs CS boundary",
)


In [None]:
# Plot: effect trajectory + CS band


def _numeric_column(table, name):
    """Return ``table[name]`` as a float array, or all-NaN if the column is absent.

    Values that cannot be parsed as numbers are coerced to NaN.
    """
    if name not in table.columns:
        return np.full(len(table), np.nan)
    return pd.to_numeric(table[name], errors="coerce").to_numpy(dtype=float)


tab = res_cs.look_table
x = tab["look_n"].to_numpy(dtype=int)
est = tab["diff"].to_numpy(dtype=float)
# A missing bound column becomes NaN, so the point estimates are still drawn
# and only the band is left out.
cs_lo = _numeric_column(tab, "cs_low")
cs_hi = _numeric_column(tab, "cs_high")

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x, est, marker="o", label="linearized effect")
ax.fill_between(x, cs_lo, cs_hi, alpha=0.2, label="CS band")
ax.axhline(0.0, linewidth=1.0)
ax.set_title("Linearized ratio effect with anytime-valid confidence sequence")
ax.set_xlabel("look_n")
ax.set_ylabel("linearized effect")
ax.legend(loc="best")
plt.show()


## 3) Failure mode demo: denominator near-zeros

We artificially set 1% of the denominators to a near-zero value (`1e-6`) to show the resulting instability and why we need guardrails.


In [None]:
# Failure-mode demo: corrupt 1% of denominators, then rerun the CS pipeline.
df_bad = df.copy()

# Make sure "den" can hold a fractional value: writing 1e-6 into an integer
# column relies on silent upcasting, which recent pandas versions deprecate.
# NOTE(review): presumably a no-op if the simulator already emits floats — confirm.
df_bad["den"] = df_bad["den"].astype(float)

# Inject a small fraction of near-zero denominators
rng = np.random.default_rng(123)
idx = rng.choice(df_bad.index.to_numpy(), size=int(0.01 * len(df_bad)), replace=False)
df_bad.loc[idx, "den"] = 1e-6

df_bad_lin, baseline_bad = linearize_ratio(
    df_bad,
    num_col="num",
    den_col="den",
    group_col="group",
    control_label="control",
    baseline_mode="first_look",
    first_look_n=looks[0],
)

# Same spec / schedule / CS config as the clean run, so only the data differ.
lt_bad, _ = build_look_table_mean(df_bad_lin, spec, schedule, cfg_cs)
res_bad = run_confidence_sequence(lt_bad, cfg_cs)

tab = res_bad.look_table
x = tab["look_n"].to_numpy(dtype=int)
z = tab["z"].to_numpy(dtype=float)
# Fall back to a NaN boundary when the column is absent, so the z trajectory
# is still drawn instead of the cell failing.
if "boundary_z" in tab.columns:
    b = pd.to_numeric(tab["boundary_z"], errors="coerce").to_numpy(dtype=float)
else:
    b = np.full(len(tab), np.nan)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(x, z, marker="o", label="z (bad den)")
(upper,) = ax.plot(x, b, linestyle="--", label="CS boundary")
# Mirror the boundary in the same colour so both halves read as one curve.
ax.plot(x, -b, linestyle="--", color=upper.get_color())
ax.axhline(0.0, linewidth=1.0)
ax.set_title("Instability demo: near-zero denominators can distort inference")
ax.set_xlabel("look_n")
ax.set_ylabel("z")
ax.legend(loc="best")
plt.show()

print("baseline_bad=", baseline_bad)
print("Decision with bad denominators:", res_bad.decision, "stopped=", res_bad.stopped, "stop_look=", res_bad.stop_look)


## Interpretation

- Linearization converts ratio testing into a mean-difference problem on a constructed variable `y_lin`.
- Sequential monitoring then works as usual (group sequential or CS).
- Denominator pathologies (zeros / near-zeros) can create extreme variance and misleading signals.

Guardrails (v1):
- Track share of `den <= 0` or `den` below a small threshold.
- Consider filtering / winsorization policies for extreme num/den pairs.
- Prefer a stable baseline ratio (e.g., pre-period or first look) to avoid a moving target.


## Summary (for article / report)

Fill in the placeholders below after running the final cell, which prints each value:

- Schedule: `{looks}` (K = {K}), α = {alpha}
- Baseline ratio (control, first look): **{baseline:.6f}**
- Final (CS) stop look: **{stop_look}**; decision: **{decision}**
- Denominator pathology demo:
  - injected near-zero share: 1%
  - decision changed? {changed}

Claim:
- “Ratio metrics require linearization and denominator guardrails; sequential inference should be run on the linearized signal.”


In [None]:
## Commands to extract final numbers for the Summary section
K = len(looks)

# Values from the clean run, printed as "label value" pairs in a fixed order.
clean_run_report = [
    ("looks=", looks),
    ("K=", K),
    ("alpha=", alpha),
    ("baseline=", baseline),
    ("gs_stop_look=", res_gs.stop_look),
    ("gs_decision=", res_gs.decision),
    ("cs_stop_look=", res_cs.stop_look),
    ("cs_decision=", res_cs.decision),
]
for label, value in clean_run_report:
    print(label, value)

# The corrupted-denominator run counts as "changed" if either the decision
# or the stopping look differs from the clean CS run.
changed = (res_bad.decision != res_cs.decision) or (res_bad.stop_look != res_cs.stop_look)

bad_run_report = [
    ("bad_den_stop_look=", res_bad.stop_look),
    ("bad_den_decision=", res_bad.decision),
    ("decision_changed=", changed),
]
for label, value in bad_run_report:
    print(label, value)
