
# A/B Test Sample Size Notebook — One-Sided Two-Proportion Z-Test (H1: p2 > p1)

This notebook helps you design and visualize a one-sided A/B test for conversion rates where the
alternative hypothesis is that the treatment is better than control (H1: p2 > p1).

What you get:
- Per-group sample size via:
  - Manual closed-form formula on raw proportions (pooled variance under H0)
  - Manual Cohen’s h formula
  - statsmodels power solver
- MDE can be absolute (e.g., +0.02) or relative (e.g., +10% uplift of p1)
- Visuals to build intuition:
  - Sample size vs. MDE (absolute)
  - Cohen’s h vs. baseline for a fixed absolute lift
  - Power vs. per-group sample size (n)

Every line of code includes comments explaining what the code does and why (the concept).


In [None]:

# ============================================================
# Imports & Power Solver
# ============================================================

import math                          # math functions (sqrt, asin) for manual formulas
import numpy as np                   # arrays & vectorized math for simulations/plots
import matplotlib.pyplot as plt      # visualization (matplotlib only; no seaborn)

# statsmodels: power analysis engine for common tests
from statsmodels.stats.power import NormalIndPower          # solves for n, power, or effect
from statsmodels.stats.proportion import proportion_effectsize  # computes Cohen's h given p1, p2

# Standard-normal quantile helper: prefer SciPy, fall back to the standard library.
# (The stdlib `math` module has no `erfinv`, so the fallback uses statistics.NormalDist.)
try:
    from scipy.stats import norm                              # normal distribution utilities (ppf = inverse CDF)
except ImportError:                                           # only a missing SciPy should trigger the fallback
    from statistics import NormalDist                         # stdlib (Python 3.8+) normal distribution

    _STD_NORMAL = NormalDist()                                # N(0, 1), built once and reused by every call

    def z_quantile(prob):
        """
        Return z such that P(Z <= z) = prob for Z ~ N(0, 1) (stdlib fallback).

        Note: NormalDist.inv_cdf raises statistics.StatisticsError when prob is
        not strictly between 0 and 1.
        """
        return _STD_NORMAL.inv_cdf(prob)                      # inverse CDF of the standard normal
else:
    def z_quantile(prob):
        """Return z such that P(Z <= z) = prob for Z ~ N(0, 1) (SciPy-backed)."""
        return norm.ppf(prob)                                 # ppf = percent-point function (inverse CDF)

# Module-level power solver, created once and shared by the helpers below
# (n_per_group_statsmodels solves for n with it; plot_power_vs_n evaluates power with it).
POWER_SOLVER = NormalIndPower()                               # statsmodels NormalIndPower instance (imported above)


In [None]:

# ============================================================
# Core Functions: p2 from MDE, Sample Size Formulas, Wrapper
# ============================================================

def resolve_p2_from_mde(p1, mde, mde_type="absolute"):
    """
    Translate a baseline rate and an MDE specification into the treatment rate p2.

    Parameters
    ----------
    p1 : float
        Baseline conversion rate.
    mde : float
        Minimum detectable effect, read either as an absolute difference or as
        a fractional uplift of p1, depending on ``mde_type``.
    mde_type : {'absolute', 'relative'}
        'absolute' -> p2 = p1 + mde (e.g. 0.02 means +2 percentage points);
        'relative' -> p2 = p1 * (1 + mde) (e.g. 0.10 means +10% of p1).

    Returns
    -------
    float
        p2, clipped to the closed range [1e-8, 1 - 1e-8].

    Raises
    ------
    ValueError
        If ``mde_type`` is neither 'absolute' nor 'relative'.
    """
    # Guard clause first: reject unknown modes before doing any arithmetic.
    if mde_type not in ("absolute", "relative"):
        raise ValueError("mde_type must be 'absolute' or 'relative'")

    if mde_type == "absolute":
        target = p1 + mde                                      # additive lift
    else:
        target = p1 * (1 + mde)                                # multiplicative lift

    # Keep the result away from exactly 0 or 1, where p * (1 - p) variance terms vanish.
    lower, upper = 1e-8, 1 - 1e-8
    return float(np.clip(target, lower, upper))


def n_per_group_raw(p1, p2, alpha=0.05, power=0.80):
    """
    Per-group sample size for a one-sided two-proportion z-test (equal n per group),
    using the manual closed form with z_{1-alpha}:

    n ~= [ z_{1-alpha} * sqrt(2 * p * (1 - p)) + z_{1-beta} * sqrt(p1(1 - p1) + p2(1 - p2)) ]^2 / (p1 - p2)^2

    where p = (p1 + p2)/2 is the pooled proportion.

    Parameters
    ----------
    p1, p2 : float                      # control / treatment conversion rates
    alpha : float                       # one-sided significance level
    power : float                       # desired power (1 - beta)

    Returns
    -------
    n : float                           # per-group sample size (not rounded; round UP when planning)
                                        # math.inf when p1 == p2 (a zero effect can never be detected)

    Concepts:
    - z_{1-alpha} (one-sided) controls Type I error in a single tail (H1: p2 > p1).
    - z_{1-beta} targets the desired Power = 1-beta (prob of detecting the effect).
    - The pooled term is the variance under H0 (both groups share one rate).
    """
    diff = abs(p2 - p1)                                        # effect magnitude we want to detect
    if diff == 0:
        # Zero effect: the formula divides by diff**2. Return infinity explicitly so the
        # result does not depend on whether z_quantile yields Python floats
        # (ZeroDivisionError) or NumPy scalars (inf plus a RuntimeWarning).
        return math.inf

    z_alpha_one_sided = z_quantile(1 - alpha)                  # one-sided cutoff for significance level alpha
    z_power        = z_quantile(power)                         # cutoff that delivers desired power 1-beta
    p_pool         = 0.5 * (p1 + p2)                           # pooled conversion rate under H0 assumption
    term_null      = math.sqrt(2 * p_pool * (1 - p_pool))      # variability under null (pooled)
    term_alt       = math.sqrt(p1*(1 - p1) + p2*(1 - p2))      # variability under alternative (true p1, p2)
    n              = ((z_alpha_one_sided*term_null + z_power*term_alt) ** 2) / (diff ** 2)  # closed-form n
    return n                                                   # per-group sample size (float; round up in planning)


def cohens_h(p1, p2):
    """
    Signed Cohen's h: 2 * (arcsin(sqrt(p1)) - arcsin(sqrt(p2))).

    Concept:
    - phi(p) = arcsin(sqrt(p)) is the variance-stabilizing transform for proportions.
    - The slope of 2*phi(p) is 1/sqrt(p(1 - p)), which is smallest at p = 0.5, so a fixed
      absolute difference yields a LARGER |h| near 0 or 1 than around 0.5.
    - The sign follows p1 - p2 (negative when p2 > p1); take abs() for the magnitude.
    """
    phi_1 = math.asin(math.sqrt(p1))                           # transformed baseline rate
    phi_2 = math.asin(math.sqrt(p2))                           # transformed treatment rate
    return 2.0 * (phi_1 - phi_2)                               # signed standardized difference


def n_per_group_h(p1, p2, alpha=0.05, power=0.80):
    """
    Approx per-group n using Cohen's h for two independent proportions (equal n),
    one-sided test uses z_{1-alpha} instead of z_{1-alpha/2}:

    n ~= 2 * (z_{1-alpha} + z_{1-beta})^2 / h^2

    Parameters
    ----------
    p1, p2 : float                      # control / treatment conversion rates
    alpha : float                       # one-sided significance level
    power : float                       # desired power (1 - beta)

    Returns
    -------
    n : float                           # per-group sample size (not rounded)
                                        # math.inf when h == 0 (a zero effect can never be detected)
    """
    h = abs(cohens_h(p1, p2))                                  # standardized effect magnitude (positive)
    if h == 0:
        # Zero effect: the formula divides by h**2. Return infinity explicitly so the
        # result does not depend on whether z_quantile yields Python floats
        # (ZeroDivisionError) or NumPy scalars (inf plus a RuntimeWarning).
        return math.inf

    z_alpha_one_sided = z_quantile(1 - alpha)                  # one-sided critical value
    z_power        = z_quantile(power)                         # z for achieving target power
    n              = 2.0 * ((z_alpha_one_sided + z_power) ** 2) / (h ** 2)  # closed-form using h
    return n                                                   # per-group sample size (float)


def n_per_group_statsmodels(p1, p2, alpha=0.05, power=0.80, alternative="larger"):
    """
    Per-group n obtained from the shared statsmodels solver (POWER_SOLVER).

    The raw proportions are first turned into an effect size with
    proportion_effectsize; only its magnitude (abs) is handed to the solver.

    Parameters:
    - p1, p2: control / treatment conversion rates.
    - alpha: significance level.
    - power: desired power (1 - beta).
    - alternative: forwarded unchanged to the solver; for one-sided H1: p2 > p1, use 'larger'.

    Returns the solver's result for the per-group sample size (not rounded).
    """
    h_magnitude = abs(proportion_effectsize(p1, p2))           # direction dropped; magnitude only

    solver_kwargs = dict(
        effect_size=h_magnitude,                               # standardized effect to detect
        alpha=alpha,                                           # Type I error rate
        power=power,                                           # target power
        ratio=1.0,                                             # equal group sizes
        alternative=alternative,                               # sidedness of the test
    )
    # The sample size is the one quantity left unspecified, so it is what gets solved for.
    return POWER_SOLVER.solve_power(**solver_kwargs)


def compute_sample_sizes(p1, mde, mde_type="absolute", alpha=0.05, power=0.80, alternative="larger"):
    """
    Run all three sample-size methods for one design and collect the results.

    Steps:
    - Resolve p2 from p1 and the MDE (absolute or relative).
    - Compute per-group n with the raw formula, the Cohen's h formula, and statsmodels.
    - Recommend the ceiling of the largest of the three.

    Note: `alternative` is forwarded only to the statsmodels method; the two manual
    formulas are called without it.

    Returns a dict with keys: p1, p2, alpha, power, alternative, n_raw, n_h,
    n_statsmodels, n_recommended_per_group.
    """
    p2 = resolve_p2_from_mde(p1, mde, mde_type=mde_type)       # business MDE -> target treatment rate

    summary = {"p1": p1, "p2": p2, "alpha": alpha, "power": power, "alternative": alternative}

    # Same evaluation order as the report: raw formula, Cohen's h formula, statsmodels.
    summary["n_raw"] = n_per_group_raw(p1, p2, alpha=alpha, power=power)
    summary["n_h"] = n_per_group_h(p1, p2, alpha=alpha, power=power)
    summary["n_statsmodels"] = n_per_group_statsmodels(
        p1, p2, alpha=alpha, power=power, alternative=alternative
    )

    # Conservative planning choice: take the largest estimate and round UP to whole units.
    largest = max(summary["n_raw"], summary["n_h"], summary["n_statsmodels"])
    summary["n_recommended_per_group"] = math.ceil(largest)
    return summary


In [None]:

# ============================================================
# Visualization Helpers (Matplotlib only; one plot per figure)
# ============================================================

def plot_n_vs_mde(p1, alpha=0.05, power=0.80, mde_min=0.005, mde_max=0.10, points=60):
    """
    Draw per-group sample size against absolute MDE for both manual formulas.

    Concept: the smaller the effect you insist on detecting, the larger the sample
    you need — the curve makes that business tradeoff visible.

    Parameters:
    - p1: baseline conversion rate.
    - alpha, power: design targets passed to both formulas.
    - mde_min, mde_max, points: evenly spaced grid of absolute MDEs to evaluate.

    Shows one matplotlib figure; returns None.
    """
    mdes = np.linspace(mde_min, mde_max, points)               # absolute MDE grid
    p2_targets = [resolve_p2_from_mde(p1, m, mde_type="absolute") for m in mdes]  # implied p2 per MDE

    raw_sizes = [n_per_group_raw(p1, p2, alpha, power) for p2 in p2_targets]      # pooled raw formula
    h_sizes = [n_per_group_h(p1, p2, alpha, power) for p2 in p2_targets]          # Cohen's h formula

    mde_pp = mdes*100                                          # fractions -> percentage points for the x-axis

    plt.figure(figsize=(8, 5))
    plt.plot(mde_pp, raw_sizes, label="Manual Raw (pooled)")
    plt.plot(mde_pp, h_sizes, linestyle="--", label="Manual Cohen's h")
    plt.xlabel("MDE (absolute percentage points)")
    plt.ylabel("Per-group sample size (n)")
    plt.title(f"Sample Size vs. MDE (Baseline p1={p1:.0%}, alpha={alpha}, Power={power}, one-sided)")
    plt.legend()
    plt.grid(True, alpha=0.3)                                  # faint grid keeps the curves readable
    plt.tight_layout()
    plt.show()


def plot_h_vs_baseline(abs_diff=0.05, p1_min=0.05, p1_max=0.95, points=200):
    """
    Plot |Cohen's h| for one fixed absolute difference across a range of baselines,
    illustrating the nonlinearity of the 0–1 probability scale.

    Concept: the same absolute lift is a larger standardized shift when the baseline
    sits near 0 or 1 than when it sits near 50%.

    Parameters:
    - abs_diff: absolute difference added to every baseline.
    - p1_min, p1_max, points: evenly spaced grid of baselines.

    Note: p2 comes from resolve_p2_from_mde, which clips it just below 1, so a
    baseline with p1 + abs_diff >= 1 is evaluated at the clipped value.

    Shows one matplotlib figure; returns None.
    """
    baselines = np.linspace(p1_min, p1_max, points)            # baseline grid
    h_magnitudes = [
        abs(cohens_h(base, resolve_p2_from_mde(base, abs_diff, "absolute")))
        for base in baselines
    ]                                                          # |h| for each (baseline, baseline + diff) pair

    plt.figure(figsize=(8, 5))
    plt.plot(baselines*100, h_magnitudes)                      # x-axis in percent
    plt.xlabel("Baseline p1 (%)")
    plt.ylabel("Cohen's h (|standardized effect|)")
    plt.title(f"Cohen's h vs Baseline (Absolute diff = {abs_diff*100:.1f} pp)")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


def plot_power_vs_n(p1, mde, mde_type="absolute", alpha=0.05,
                    n_min=50, n_max=5000, step=50,
                    alternative="larger", target_power=0.80):
    """
    Plot statistical power as a function of per-group sample size (n)
    for a two-proportion z-test, keeping p1/p2 (i.e., the effect) fixed.

    Parameters:
    - p1, mde, mde_type: baseline and MDE specification; p2 is resolved from them.
    - alpha: significance level passed to the power solver.
    - n_min, n_max, step: per-group sample sizes to evaluate (n_max included when reachable).
    - alternative: passed to the power solver; 'larger' for H1: p2 > p1.
    - target_power: height of the dashed reference line.

    Concept:
    - Power rises with n: more data -> easier to detect the same true effect.
    - This curve helps choose a practical n that meets your power target.

    Shows one matplotlib figure; returns None.
    """
    def _as_pct(rate):
        # Up to 3 significant digits: 0.10 -> "10%", 0.105 -> "10.5%".
        # A fixed ".0%" format would print a relative-MDE target such as 10.5%
        # as "10%", making p1 and p2 look identical in the title.
        return f"{rate * 100:.3g}%"

    p2 = resolve_p2_from_mde(p1, mde, mde_type=mde_type)       # fix effect by resolving p2
    h_magnitude = abs(proportion_effectsize(p1, p2))           # effect size magnitude (Cohen's h)
    n_grid = np.arange(n_min, n_max + 1, step)                 # candidate per-group sample sizes

    powers = [
        POWER_SOLVER.power(effect_size=h_magnitude,            # magnitude of effect
                           nobs1=n,                            # per-group sample size
                           alpha=alpha,                        # Type I error rate
                           ratio=1.0,                          # equal group sizes
                           alternative=alternative)            # 'larger' for one-sided H1
        for n in n_grid
    ]

    plt.figure(figsize=(8, 5))                                 # single chart
    plt.plot(n_grid, powers, label="Power vs. per-group n")    # power curve
    plt.axhline(y=target_power, linestyle="--",                # reference line at target power
                label=f"Target power = {target_power:.2f}")
    plt.xlabel("Per-group sample size (n)")                    # x-axis label
    plt.ylabel("Power (1 - beta)")                             # y-axis label
    plt.title(f"Power Curve (p1={_as_pct(p1)}, p2={_as_pct(p2)}, alpha={alpha}, one-sided)")  # informative title
    plt.grid(True, alpha=0.3)                                  # readability
    plt.legend()                                               # legend
    plt.tight_layout()                                         # tidy layout
    plt.show()                                                 # render


## Configure & Run

In [None]:

# ============================================================
# CONFIGURATION (EDIT THESE)
# ============================================================
p1_cfg       = 0.10            # baseline conversion rate (0.10 = 10%)
mde_cfg      = 0.05            # desired MDE; meaning depends on mde_type_cfg below
mde_type_cfg = "absolute"      # 'absolute': 0.05 = +5 percentage points; 'relative': 0.05 = +5% uplift of p1
alpha_cfg    = 0.05            # significance level (the manual formulas use the one-sided z_{1-alpha})
power_cfg    = 0.80            # desired power (1 - beta)
alt_cfg      = "larger"        # ONE-SIDED H1: p2 > p1 -> 'larger'; only the statsmodels calls receive this value

# ============================================================
# RUN: compute sample sizes & show visuals
# ============================================================
results = compute_sample_sizes(p1=p1_cfg, mde=mde_cfg, mde_type=mde_type_cfg,
                               alpha=alpha_cfg, power=power_cfg, alternative=alt_cfg)  # dict with all three methods

# Report results (planning tip: round UP per group)
print("=== DESIGN INPUTS ===")                               # header for clarity
print(f"Baseline p1:            {results['p1']:.4f}")        # echo p1
print(f"Treatment p2 (target):  {results['p2']:.4f}")        # p2 implied by p1 and the MDE
print(f"Alpha:                  {results['alpha']}")         # alpha shown explicitly
print(f"Power:                  {results['power']}")         # power shown explicitly
print(f"Alternative:            {results['alternative']}")   # value forwarded to statsmodels

print("\n=== PER-GROUP SAMPLE SIZE (n) ===")                 # header
print(f"Manual: Raw pooled      {results['n_raw']:.2f}")     # manual raw formula
print(f"Manual: Cohen's h       {results['n_h']:.2f}")       # manual h formula
print(f"statsmodels (h inside)  {results['n_statsmodels']:.2f}")  # statsmodels solver
print(f"\nRECOMMENDED (ceil max): {results['n_recommended_per_group']} per group")  # ceil of the largest estimate

# Visual 1: Sample size vs absolute MDE (intuition on trade-offs)
plot_n_vs_mde(p1=p1_cfg, alpha=alpha_cfg, power=power_cfg, mde_min=0.005, mde_max=0.10, points=60)  # 0.5pp -> 10pp grid

# Visual 2: Cohen's h vs baseline for a fixed absolute difference
plot_h_vs_baseline(abs_diff=0.05, p1_min=0.05, p1_max=0.95, points=200)  # uses a fixed 5pp lift, independent of mde_cfg

# Visual 3: Power vs n (for your one-sided H1: p2 > p1)
plot_power_vs_n(p1=p1_cfg, mde=mde_cfg, mde_type=mde_type_cfg,
                alpha=alpha_cfg, n_min=50, n_max=5000, step=50,
                alternative=alt_cfg, target_power=power_cfg)
