We initialize Python imports and opens a DuckDB connection that every later cell reuses. We also load the preferred specs and the modeling frame and set bootstrap parameters so the bootstrap can run 1000 iterations without rewriting code

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

warnings.filterwarnings("ignore", category=RuntimeWarning)

CWD = Path().resolve()
DB_FILE = None
for p in [CWD] + list(CWD.parents):
    cand = p / "db" / "nflpa.duckdb"
    if cand.exists():
        DB_FILE = cand
        break
if DB_FILE is None:
    for p in [CWD] + list(CWD.parents):
        cand = p / "nflpa.duckdb"
        if cand.exists():
            DB_FILE = cand
            break
if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb")

con = duckdb.connect(str(DB_FILE), read_only=False)

need = ["step18_model_frame", "step18_preferred_model_specs"]
existing = set(con.execute("SHOW TABLES").df()["name"].astype(str).tolist())
missing = [t for t in need if t not in existing]
if missing:
    raise RuntimeError(f"Missing tables for step20, {missing}, run notebook 18 first")

df = con.execute("SELECT * FROM step18_model_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

TEAM_COL = "team" if "team" in df.columns else "team_key"

pref_def = pref[pref["side"] == "def"].iloc[0].to_dict()
pref_off = pref[pref["side"] == "off"].iloc[0].to_dict()

formula_def = str(pref_def["formula"])
formula_off = str(pref_off["formula"])
fam_def = str(pref_def["family"])
fam_off = str(pref_off["family"])

N_BOOT = 500
SEED = 20260106

print("bootstrap settings")
print("N_BOOT", N_BOOT)
print("SEED", SEED)
print("families", fam_def, fam_off)

bootstrap settings
N_BOOT 500
SEED 20260106
families poisson poisson


We fit the full sample preferred models once with clustered inference to produce standard confidence intervals for comparison against bootstrap intervals

In [2]:
def fit_poisson(formula: str, data: pd.DataFrame):
    m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    r = m.fit(maxiter=200, disp=0)
    return r

def fit_nb_discrete(formula: str, data: pd.DataFrame):
    m = smf.negativebinomial(formula=formula, data=data)
    r = m.fit(disp=False, maxiter=200)
    return r

def robust_cluster(res, groups: pd.Series):
    try:
        return res.get_robustcov_results(cov_type="cluster", groups=groups)
    except Exception:
        return None

def standard_ci(res, groups: pd.Series, key_terms: list[str]) -> pd.DataFrame:
    rob = robust_cluster(res, groups)
    if rob is None:
        params = res.params
        se = res.bse
    else:
        params = rob.params
        se = rob.bse

    out = []
    for t in key_terms:
        if t in params.index:
            b = float(params.loc[t])
            s = float(se.loc[t])
            out.append({
                "term": t,
                "beta": b,
                "se_cluster": s,
                "ci_lo": b - 1.96 * s,
                "ci_hi": b + 1.96 * s,
            })
    return pd.DataFrame(out)

key_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "vol_nonscore_s2d_prior",
    "vol_nonscore_roll4_prior",
    "cum_shocks_nonscore_prior",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
key_terms = [t for t in key_terms if t in df.columns or t.startswith("ST_Shock") or t.startswith("shock") or t.startswith("vol") or t.startswith("cum")]

groups = df[TEAM_COL]

res_def = None
res_off = None

try:
    res_def = fit_poisson(formula_def, df) if fam_def == "poisson" else fit_nb_discrete(formula_def, df)
except Exception as e:
    raise RuntimeError(f"Full sample defense fit failed, {str(e)}")

try:
    res_off = fit_poisson(formula_off, df) if fam_off == "poisson" else fit_nb_discrete(formula_off, df)
except Exception as e:
    raise RuntimeError(f"Full sample offense fit failed, {str(e)}")

std_def = standard_ci(res_def, groups, key_terms)
std_def["side"] = "def"
std_off = standard_ci(res_off, groups, key_terms)
std_off["side"] = "off"

std_ci_df = pd.concat([std_def, std_off], ignore_index=True)

print("standard clustered intervals for key terms")
print(std_ci_df)

standard clustered intervals for key terms
                        term      beta  se_cluster     ci_lo     ci_hi side
0             shock_nonscore  0.023231    0.030627 -0.036799  0.083261  def
1            shock_x_blowout  0.034795    0.053068 -0.069218  0.138808  def
2   vol_nonscore_roll4_prior -0.009422    0.008052 -0.025204  0.006360  def
3  cum_shocks_nonscore_prior  0.005955    0.010063 -0.013768  0.025678  def
4             shock_nonscore  0.032298    0.032283 -0.030977  0.095574  off
5            shock_x_blowout  0.073520    0.054459 -0.033221  0.180260  off
6   vol_nonscore_roll4_prior  0.023239    0.008285  0.007000  0.039477  off
7  cum_shocks_nonscore_prior -0.010309    0.010524 -0.030937  0.010318  off
