We initialize the Python imports and open a DuckDB connection that every later cell reuses

In [5]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Best-effort switch of the statsmodels GLM BIC flag; any failure is ignored on purpose.
# NOTE(review): presumably this selects the log-likelihood based BIC, confirm against statsmodels docs.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

# Silence the FutureWarning whose message matches the deviance-based BIC notice.
warnings.filterwarnings(
    "ignore",
    message="The bic value is computed using the deviance formula.*",
    category=FutureWarning,
)

CWD = Path().resolve()

# Walk the working directory and its ancestors, preferring <dir>/db/nflpa.duckdb
# everywhere before falling back to <dir>/nflpa.duckdb.
search_dirs = [CWD, *CWD.parents]

REPO_ROOT = None
DB_FILE = None

for rel_path in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    hit = next((d for d in search_dirs if (d / rel_path).exists()), None)
    if hit is not None:
        REPO_ROOT = hit
        DB_FILE = hit / rel_path
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

con = duckdb.connect(str(DB_FILE), read_only=False)

# Tables this notebook reads; both must already exist in the database.
need_tables = [
    "step18_model_frame",
    "step18_preferred_model_specs",
]

existing = {str(name) for name in con.execute("SHOW TABLES").df()["name"]}
missing = [tbl for tbl in need_tables if tbl not in existing]

print("connected db", str(DB_FILE))
print("missing step 21 inputs", missing)
if missing:
    raise RuntimeError("Missing step 21 inputs, rerun step 18 first")


connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
missing step 21 inputs []


We load the model frame and preferred specs, then standardize key column names and verify that the required outcomes are present

In [2]:
# Pull the step 18 model frame and the preferred model specifications.
df = con.execute("SELECT * FROM step18_model_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

# Team identifier: "team" wins over "team_key" when both are present.
TEAM_COL = next((name for name in ("team", "team_key") if name in df.columns), None)
if TEAM_COL is None:
    raise RuntimeError("Missing team identifier column in step18_model_frame")

SEASON_COL = "season"
WEEK_COL = "week"

req_outcomes = ["Inj_Off_Next_w", "Inj_Def_Next_w"]
for outcome_col in req_outcomes:
    if outcome_col in df.columns:
        continue
    raise RuntimeError(f"Missing {outcome_col} in step18_model_frame, rerun step 16 and step 18")

# Derive season_week as season * 100 + week only when the frame lacks it.
if "season_week" not in df.columns:
    season_int = df[SEASON_COL].astype(int)
    week_int = df[WEEK_COL].astype(int)
    df["season_week"] = (season_int * 100 + week_int).astype(int)

# Exposure inputs that cannot be rebuilt here; each has its own rerun hint.
required_exposures = (
    ("load_nonscore", "Missing load_nonscore in step18_model_frame, rerun step 18"),
    ("shock_nonscore", "Missing shock_nonscore in step18_model_frame, rerun step 18"),
    ("blowout_flag_w", "Missing blowout_flag_w in step18_model_frame, rerun step 10 onward and rebuild step 16 then step 18"),
)
for exposure_col, hint in required_exposures:
    if exposure_col not in df.columns:
        raise RuntimeError(hint)

# Interaction of the shock flag with the blowout flag (missing blowout counts as 0).
if "shock_x_blowout" not in df.columns:
    shock_int = df["shock_nonscore"].astype(int)
    blowout_int = df["blowout_flag_w"].fillna(0).astype(int)
    df["shock_x_blowout"] = (shock_int * blowout_int).astype(int)

# Binary "any injury next week" indicators derived from the count outcomes.
for flag_col, count_col in (
    ("Any_Off_Injury_Next_w", "Inj_Off_Next_w"),
    ("Any_Def_Injury_Next_w", "Inj_Def_Next_w"),
):
    if flag_col not in df.columns:
        df[flag_col] = (df[count_col].astype(float) > 0).astype(int)

print("rows step18_model_frame", len(df))
print("team col", TEAM_COL)
print("preferred specs rows", len(pref))
print(pref[["side", "outcome", "family", "spec_id"]])

rows step18_model_frame 5950
team col team
preferred specs rows 2
  side         outcome   family                 spec_id
0  def  Inj_Def_Next_w  poisson  nonscore_roll4_no_lags
1  off  Inj_Off_Next_w  poisson  nonscore_roll4_no_lags


We construct all robustness variants in a lookahead-safe way where possible, then create the lead and lag variables needed for the timing and placebo tests

In [3]:
# Work on a copy with normalized key dtypes, ordered by team, season, week.
df = df.copy()
for key_col, key_type in ((TEAM_COL, str), (SEASON_COL, int), (WEEK_COL, int)):
    df[key_col] = df[key_col].astype(key_type)

df = df.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)
g = df.groupby([TEAM_COL, SEASON_COL], sort=False)


def _load_stat_by_group(stat):
    """Apply `stat` to load_nonscore within each team-season and realign to df's row index."""
    per_group = g["load_nonscore"].apply(stat)
    return per_group.reset_index(level=[0, 1], drop=True)


def _finite_ratio(numer, denom):
    """Return numer / denom as float, with infinities and missing values replaced by 0.0."""
    ratio = numer.astype(float) / denom.astype(float)
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0.0).astype(float)


# Season-to-date statistics are shifted by one row so each week only sees earlier weeks.
mean_s2d_prior = _load_stat_by_group(lambda s: s.expanding().mean().shift(1))
sd_s2d_prior = _load_stat_by_group(lambda s: s.expanding().std(ddof=1).shift(1))

z_s2d_prior = (df["load_nonscore"] - mean_s2d_prior) / sd_s2d_prior

# Missing blowout flags are treated as 0 in every interaction below.
blowout_int = df["blowout_flag_w"].fillna(0).astype(int)

# Shock variant 1: prior z-score of at least 0.5.
df["shock_nonscore_z05"] = (z_s2d_prior >= 0.5).fillna(False).astype(int)
df["shock_x_blowout_z05"] = (df["shock_nonscore_z05"] * blowout_int).astype(int)

# Shock variant 2: top quartile of the whole team-season (uses every week of the season).
q75_full = g["load_nonscore"].transform(lambda s: float(s.quantile(0.75)) if len(s) else np.nan)
df["shock_nonscore_top25_fullseason"] = (df["load_nonscore"] >= q75_full).fillna(False).astype(int)
df["shock_x_blowout_top25_fullseason"] = (df["shock_nonscore_top25_fullseason"] * blowout_int).astype(int)

# Shock variant 3: top quartile relative to prior weeks only.
q75_prior = _load_stat_by_group(lambda s: s.expanding().quantile(0.75).shift(1))
df["shock_nonscore_top25_prior"] = (df["load_nonscore"] >= q75_prior).fillna(False).astype(int)
df["shock_x_blowout_top25_prior"] = (df["shock_nonscore_top25_prior"] * blowout_int).astype(int)

if "vol_nonscore_s2d_prior" not in df.columns or "vol_nonscore_roll4_prior" not in df.columns:
    raise RuntimeError("Missing vol_nonscore_s2d_prior or vol_nonscore_roll4_prior in step18_model_frame, rerun step 18")

# Coefficient-of-variation style volatility: prior volatility divided by the prior mean.
df["cv_nonscore_s2d_prior"] = _finite_ratio(df["vol_nonscore_s2d_prior"], mean_s2d_prior)

mean_roll4_prior = _load_stat_by_group(
    lambda s: s.shift(1).rolling(4, min_periods=2).mean()
).fillna(mean_s2d_prior)

df["cv_nonscore_roll4_prior"] = _finite_ratio(df["vol_nonscore_roll4_prior"], mean_roll4_prior)

# Outcome shifted one further week ahead within each team-season.
for unit in ("Off", "Def"):
    df[f"Inj_{unit}_Next2_w"] = g[f"Inj_{unit}_Next_w"].shift(-1)

# Outcome shifted one week back, taken from the *_Last_w column when the frame has it.
for unit in ("Off", "Def"):
    last_col = f"Inj_{unit}_Last_w"
    prev_source = last_col if last_col in df.columns else f"Inj_{unit}_Next_w"
    df[f"Inj_{unit}_Prev_w"] = g[prev_source].shift(1)

# One- and two-week leads of each exposure that is present in the frame.
lead_exposure_cols = [
    "shock_nonscore",
    "shock_x_blowout",
    "vol_nonscore_s2d_prior",
    "vol_nonscore_roll4_prior",
    "cum_shocks_nonscore_prior",
]
for exposure in lead_exposure_cols:
    if exposure not in df.columns:
        continue
    for step in (1, 2):
        df[f"{exposure}_lead{step}"] = g[exposure].shift(-step)

# Persist the frame as a DuckDB table through a temporary registered view.
con.register("step21_frame_tmp", df)
con.execute("CREATE OR REPLACE TABLE step21_frame AS SELECT * FROM step21_frame_tmp")
con.unregister("step21_frame_tmp")

out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(out_dir / "step21_frame.csv", index=False)

print("wrote duckdb table step21_frame")
print("wrote csv outputs step21_frame.csv")

wrote duckdb table step21_frame
wrote csv outputs step21_frame.csv


Quick sanity check to confirm that the new robustness fields exist, have plausible ranges, and that lead outcomes are only missing at season edges

In [4]:
# Derived columns that the previous cell is expected to have produced.
check_cols = [
    "shock_nonscore_z05",
    "shock_nonscore_top25_fullseason",
    "shock_nonscore_top25_prior",
    "cv_nonscore_s2d_prior",
    "cv_nonscore_roll4_prior",
    "Inj_Off_Next2_w",
    "Inj_Def_Next2_w",
    "Inj_Off_Prev_w",
    "Inj_Def_Prev_w",
]
present_cols = set(df.columns)
missing = [name for name in check_cols if name not in present_cols]
print("missing check cols", missing)
if missing:
    raise RuntimeError("Missing step 21 derived columns, rerun step 21 cell 3")

# One summary row per checked column: null count plus NaN-aware min, max and mean.
stat_rows = []
for name in check_cols:
    as_float = df[name].astype(float)
    stat_rows.append({
        "col": name,
        "nulls": int(df[name].isna().sum()),
        "min": float(np.nanmin(as_float)),
        "max": float(np.nanmax(as_float)),
        "mean": float(np.nanmean(as_float)),
    })
summary = pd.DataFrame(stat_rows, columns=["col", "nulls", "min", "max", "mean"])
print(summary)


def _count_missing_cells(block):
    """Total number of missing cells across all columns of one group."""
    return int(block.isna().sum().sum())


lead2_cols = ["Inj_Off_Next2_w", "Inj_Def_Next2_w"]
edge_missing = df.groupby([SEASON_COL, TEAM_COL], sort=False)[lead2_cols].apply(_count_missing_cells)
print("total lead2 missing per team season, expected small and concentrated at end weeks")
print(edge_missing.describe())

missing check cols []
                               col  nulls  min        max      mean
0               shock_nonscore_z05      0  0.0   1.000000  0.272269
1  shock_nonscore_top25_fullseason      0  0.0   1.000000  0.355126
2       shock_nonscore_top25_prior      0  0.0   1.000000  0.365882
3            cv_nonscore_s2d_prior      0  0.0   0.633958  0.168143
4          cv_nonscore_roll4_prior      0  0.0   0.633958  0.164322
5                  Inj_Off_Next2_w    416  0.0   8.000000  1.927720
6                  Inj_Def_Next2_w    416  0.0  10.000000  2.088182
7                   Inj_Off_Prev_w    416  0.0   9.000000  1.637152
8                   Inj_Def_Prev_w    416  0.0  10.000000  1.780990
total lead2 missing per team season, expected small and concentrated at end weeks
count    416.0
mean       2.0
std        0.0
min        2.0
25%        2.0
50%        2.0
75%        2.0
max        2.0
dtype: float64


We define reusable helpers to rewrite formulas, fit count models with clustered standard errors, and extract tidy coefficient outputs for baseline and robustness variants

In [7]:
def _rewrite_formula(formula: str, rename_map: dict) -> str:
    """Rename whole-identifier terms in a formula string.

    Parameters
    ----------
    formula : formula text; it is converted with ``str`` before rewriting.
    rename_map : mapping of old term -> new term. Entries whose old and new
        names are equal are ignored.

    Returns
    -------
    The formula with every standalone occurrence of an old term replaced. A
    term only matches when it is not directly preceded or followed by a
    letter, digit or underscore, so ``shock_nonscore`` never matches inside
    ``shock_nonscore_z05``.

    All renames are applied in a single pass over the original text, so the
    output of one rename is never fed into another (``{"a": "b", "b": "c"}``
    turns ``a`` into ``b``, not ``c``). Replacement text is inserted
    literally; backslashes in it are not treated as regex escapes.
    """
    text = str(formula)
    active = {old: new for old, new in rename_map.items() if old != new}
    if not active:
        return text

    # Longest keys first so the alternation prefers the most specific term.
    alternatives = "|".join(re.escape(old) for old in sorted(active, key=len, reverse=True))
    pattern = re.compile(rf"(?<![A-Za-z0-9_])(?:{alternatives})(?![A-Za-z0-9_])")

    # A callable replacement keeps the new name literal (no template expansion).
    return pattern.sub(lambda match: active[match.group(0)], text)

def _get_outcome(formula: str) -> str:
    """Return the text left of the first ``~`` in the formula, stripped of whitespace."""
    lhs, _, _ = str(formula).partition("~")
    return lhs.strip()

def _get_rhs_terms(formula: str) -> list[str]:
    """Split the right-hand side of a formula on ``+`` into stripped, non-empty terms.

    Raises IndexError when the formula contains no ``~``.
    """
    right_side = str(formula).split("~", 1)[1]
    stripped = (piece.strip() for piece in right_side.split("+"))
    return [term for term in stripped if term]

def _fit_count(formula: str, family: str, data: pd.DataFrame, cluster_col: str):
    """Fit a count model from a formula with cluster-robust standard errors.

    Parameters
    ----------
    formula : model formula passed to the statsmodels formula API.
    family : "poisson", or one of "negative_binomial" / "negativebinomial" /
        "nb" (case-insensitive).
    data : frame holding every variable in the formula plus ``cluster_col``.
    cluster_col : column of ``data`` whose values define the clusters.

    Returns
    -------
    The fitted statsmodels results object with clustered covariance.

    Raises
    ------
    RuntimeError if ``family`` is not one of the supported names.
    """
    fam = str(family).lower()
    # NOTE(review): groups are taken from the full frame; this assumes the formula
    # drops no rows for missing values, otherwise the lengths differ - confirm callers.
    cluster_kwds = {"groups": data[cluster_col]}

    if fam == "poisson":
        m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
        return m.fit(cov_type="cluster", cov_kwds=cluster_kwds)

    if fam in ["negative_binomial", "negativebinomial", "nb"]:
        m = smf.negativebinomial(formula=formula, data=data)
        # Discrete-model results have no public get_robustcov_results (that is a
        # linear-regression results method), so request clustering in fit itself.
        return m.fit(disp=False, maxiter=200, cov_type="cluster", cov_kwds=cluster_kwds)

    raise RuntimeError(f"Unknown family {family}")

def _tidy_count(res, model_name: str, side: str, spec_variant: str, family: str, key_terms: list[str]) -> pd.DataFrame:
    """Flatten a fitted count model into one row per coefficient.

    Reads ``params``, ``bse`` and ``pvalues`` from ``res`` and labels every row
    with the given model, side, variant and family. ``irr`` is exp(beta) and the
    interval columns are exp(beta -/+ 1.96 * se). ``is_key_term`` is 1 for terms
    listed in ``key_terms`` and 0 otherwise. ``nobs``, ``aic`` and ``bic`` are
    copied from ``res`` onto every row.
    """
    coef = res.params.copy()
    beta = coef.values.astype(float)
    se = res.bse.copy().values.astype(float)
    pval = res.pvalues.copy().values.astype(float)
    half_width = 1.96 * se

    out = pd.DataFrame({
        "model": str(model_name),
        "side": str(side),
        "spec_variant": str(spec_variant),
        "family": str(family),
        "outcome": str(getattr(res.model, "endog_names", "")),
        "term": coef.index.astype(str),
        "beta": beta,
        "se_cluster": se,
        "pvalue": pval,
        "irr": np.exp(beta),
        "irr_ci_lo": np.exp(beta - half_width),
        "irr_ci_hi": np.exp(beta + half_width),
    })

    wanted = set(key_terms)
    out["is_key_term"] = [1 if term in wanted else 0 for term in out["term"]]

    # Model-level statistics are repeated on every coefficient row.
    return out.assign(
        nobs=int(getattr(res, "nobs", np.nan)),
        aic=float(getattr(res, "aic", np.nan)),
        bic=float(getattr(res, "bic", np.nan)),
    )

def _infer_term_present(formula: str, term: str) -> bool:
    """Return True when ``term`` occurs in the formula as a standalone identifier.

    An occurrence counts only if it is not directly preceded or followed by a
    letter, digit or underscore.
    """
    standalone = re.compile(rf"(?<![A-Za-z0-9_]){re.escape(term)}(?![A-Za-z0-9_])")
    found = standalone.search(str(formula))
    return found is not None

We refit the preferred models under alternative shock and volatility definitions then write tidy robustness outputs to DuckDB and CSV

In [8]:
# Reload the persisted step 21 frame and make sure the team key is a string.
step21_frame = con.execute("SELECT * FROM step21_frame").df()
step21_frame[TEAM_COL] = step21_frame[TEAM_COL].astype(str)
df21 = step21_frame

def _pick_row(pref_df: pd.DataFrame, side: str) -> pd.Series:
    """Return the single row of ``pref_df`` whose ``side`` (compared as text) equals ``side``.

    Raises RuntimeError when zero or more than one row matches.
    """
    is_side = pref_df["side"].astype(str).eq(str(side))
    matches = pref_df.loc[is_side]
    if matches.shape[0] == 1:
        return matches.iloc[0]
    raise RuntimeError(f"Expected exactly one preferred spec row for side {side}")

# One preferred specification per side; formula and family are kept as plain strings.
pref_def, pref_off = (_pick_row(pref, which) for which in ("def", "off"))

base_formula_def, base_formula_off = (str(row["formula"]) for row in (pref_def, pref_off))
base_family_def, base_family_off = (str(row["family"]) for row in (pref_def, pref_off))

def _detect_vol_term(formula: str) -> str | None:
    """Return the first known volatility term present in the formula, or None.

    Candidates are checked in a fixed priority order.
    """
    known_terms = ("vol_nonscore_roll4_prior", "vol_nonscore_s2d_prior", "ST_Vol_NonScore_w")
    return next((name for name in known_terms if _infer_term_present(formula, name)), None)

def _detect_cum_term(formula: str) -> str | None:
    """Return the first known cumulative-shock term present in the formula, or None."""
    known_terms = ("cum_shocks_nonscore_prior", "Cum_Shocks_NonScore_w")
    return next((name for name in known_terms if _infer_term_present(formula, name)), None)

def _detect_shock_terms(formula: str) -> tuple[str | None, str | None]:
    """Return ``(shock_term, interaction_term)`` as found in the formula.

    Each element is the first matching candidate in priority order, or None
    when no candidate of that kind appears.
    """
    shock_names = ("shock_nonscore", "ST_Shock_NonScore_w")
    interaction_names = (
        "shock_x_blowout",
        "shock_x_blowout_nonscore",
        "shock_x_blowout_all",
        "shock_x_blowout_scorelinked",
    )

    def _first_present(names):
        return next((name for name in names if _infer_term_present(formula, name)), None)

    return _first_present(shock_names), _first_present(interaction_names)

# Locate the exposure terms actually used by each preferred formula.
base_shock_def, base_inter_def = _detect_shock_terms(base_formula_def)
base_shock_off, base_inter_off = _detect_shock_terms(base_formula_off)

base_vol_def = _detect_vol_term(base_formula_def)
base_vol_off = _detect_vol_term(base_formula_off)

base_cum_def = _detect_cum_term(base_formula_def)
base_cum_off = _detect_cum_term(base_formula_off)

if base_shock_def is None or base_inter_def is None or base_vol_def is None or base_cum_def is None:
    raise RuntimeError("Could not detect key exposure terms in preferred defense formula, inspect step18_preferred_model_specs")

if base_shock_off is None or base_inter_off is None or base_vol_off is None or base_cum_off is None:
    raise RuntimeError("Could not detect key exposure terms in preferred offense formula, inspect step18_preferred_model_specs")

# Each variant maps baseline terms to replacement columns, separately per side.
robust_variants = [{"variant": "baseline", "rename_def": {}, "rename_off": {}}]

# Alternative shock definitions swap both the shock flag and its blowout interaction.
for shock_suffix in ("z05", "top25_prior", "top25_fullseason"):
    robust_variants.append({
        "variant": f"shock_{shock_suffix}",
        "rename_def": {
            base_shock_def: f"shock_nonscore_{shock_suffix}",
            base_inter_def: f"shock_x_blowout_{shock_suffix}",
        },
        "rename_off": {
            base_shock_off: f"shock_nonscore_{shock_suffix}",
            base_inter_off: f"shock_x_blowout_{shock_suffix}",
        },
    })

# Alternative volatility definitions swap only the volatility term.
for vol_variant, vol_column in (
    ("vol_roll4_only", "vol_nonscore_roll4_prior"),
    ("vol_s2d_only", "vol_nonscore_s2d_prior"),
    ("vol_cv_s2d", "cv_nonscore_s2d_prior"),
    ("vol_cv_roll4", "cv_nonscore_roll4_prior"),
):
    robust_variants.append({
        "variant": vol_variant,
        "rename_def": {base_vol_def: vol_column},
        "rename_off": {base_vol_off: vol_column},
    })

# Per-side baseline settings, iterated def first and then off for every variant.
side_specs = {
    "def": {
        "formula": base_formula_def,
        "family": base_family_def,
        "shock": base_shock_def,
        "inter": base_inter_def,
        "vol": base_vol_def,
        "cum": base_cum_def,
    },
    "off": {
        "formula": base_formula_off,
        "family": base_family_off,
        "shock": base_shock_off,
        "inter": base_inter_off,
        "vol": base_vol_off,
        "cum": base_cum_off,
    },
}

# Fixed output schemas so the tables have the same columns whether fits succeed or fail.
robust_coef_cols = [
    "model", "side", "spec_variant", "family", "outcome", "term", "beta", "se_cluster",
    "pvalue", "irr", "irr_ci_lo", "irr_ci_hi", "is_key_term", "nobs", "aic", "bic",
]
robust_meta_cols = ["side", "spec_variant", "family", "formula", "nobs", "aic", "bic", "fit_ok", "fit_error"]

coef_rows = []
meta_rows = []

for v in robust_variants:
    tag = v["variant"]

    for side, spec in side_specs.items():
        rename = v[f"rename_{side}"]
        formula_v = _rewrite_formula(spec["formula"], rename)

        # Key terms are the (possibly renamed) exposures that appear in the rewritten formula.
        candidates = [rename.get(spec[k], spec[k]) for k in ("shock", "inter", "vol")]
        candidates.append(spec["cum"])
        candidates.extend(f"ST_Shock_NonScore_w_minus_{k}" for k in (1, 2, 3))
        key_terms = [t for t in candidates if t is not None and _infer_term_present(formula_v, t)]

        # Start from a failure record; it is upgraded only after a successful fit and tidy.
        meta = {
            "side": side,
            "spec_variant": tag,
            "family": spec["family"],
            "formula": formula_v,
            "nobs": np.nan,
            "aic": np.nan,
            "bic": np.nan,
            "fit_ok": 0,
            "fit_error": "",
        }
        try:
            res = _fit_count(formula_v, spec["family"], df21, TEAM_COL)
            coef_rows.append(_tidy_count(res, f"preferred_count_{side}", side, tag, spec["family"], key_terms))
            meta.update(
                nobs=int(getattr(res, "nobs", np.nan)),
                aic=float(getattr(res, "aic", np.nan)),
                bic=float(getattr(res, "bic", np.nan)),
                fit_ok=1,
            )
        except Exception as e:
            # A failed variant is recorded in the meta table instead of stopping the run.
            meta["fit_error"] = str(e)
        meta_rows.append(pd.DataFrame([meta]))

coef_df = pd.concat(coef_rows, ignore_index=True) if coef_rows else pd.DataFrame(columns=robust_coef_cols)
meta_df = pd.concat(meta_rows, ignore_index=True) if meta_rows else pd.DataFrame(columns=robust_meta_cols)
meta_df = meta_df[robust_meta_cols]

n_failed = int((meta_df["fit_ok"] == 0).sum()) if len(meta_df) else 0
if n_failed:
    print("robustness fits failed", n_failed, "see fit_error in step21_robustness_meta")

con.register("step21_robust_coef_tmp", coef_df)
con.execute("CREATE OR REPLACE TABLE step21_robustness_coefficients AS SELECT * FROM step21_robust_coef_tmp")
con.unregister("step21_robust_coef_tmp")

con.register("step21_robust_meta_tmp", meta_df)
con.execute("CREATE OR REPLACE TABLE step21_robustness_meta AS SELECT * FROM step21_robust_meta_tmp")
con.unregister("step21_robust_meta_tmp")

coef_df.to_csv(out_dir / "step21_robustness_coefficients.csv", index=False)
meta_df.to_csv(out_dir / "step21_robustness_meta.csv", index=False)

print("wrote duckdb table step21_robustness_coefficients")
print("wrote duckdb table step21_robustness_meta")
print("wrote csv outputs step21_robustness_coefficients.csv and step21_robustness_meta.csv")

coef_df.query("is_key_term == 1").sort_values(["side", "spec_variant", "term"]).head(40)

wrote duckdb table step21_robustness_coefficients
wrote duckdb table step21_robustness_meta
wrote csv outputs step21_robustness_coefficients.csv and step21_robustness_meta.csv


Unnamed: 0,model,side,spec_variant,family,outcome,term,beta,se_cluster,pvalue,irr,irr_ci_lo,irr_ci_hi,is_key_term,nobs,aic,bic
249,preferred_count_def,def,baseline,poisson,Inj_Def_Next_w,cum_shocks_nonscore_prior,0.005955,0.01482,0.687817,1.005973,0.977172,1.035623,1,5950,20805.188499,22551.577735
246,preferred_count_def,def,baseline,poisson,Inj_Def_Next_w,shock_nonscore,0.023231,0.031875,0.466109,1.023503,0.961517,1.089485,1,5950,20805.188499,22551.577735
247,preferred_count_def,def,baseline,poisson,Inj_Def_Next_w,shock_x_blowout,0.034795,0.055765,0.532656,1.035407,0.928203,1.154993,1,5950,20805.188499,22551.577735
248,preferred_count_def,def,baseline,poisson,Inj_Def_Next_w,vol_nonscore_roll4_prior,-0.009422,0.007731,0.222963,0.990622,0.975725,1.005748,1,5950,20805.188499,22551.577735
1815,preferred_count_def,def,shock_top25_fullseason,poisson,Inj_Def_Next_w,cum_shocks_nonscore_prior,0.006049,0.014847,0.683681,1.006068,0.977213,1.035775,1,5950,20807.252381,22553.641618
1812,preferred_count_def,def,shock_top25_fullseason,poisson,Inj_Def_Next_w,shock_nonscore_top25_fullseason,0.008404,0.024889,0.735626,1.008439,0.960426,1.058853,1,5950,20807.252381,22553.641618
1813,preferred_count_def,def,shock_top25_fullseason,poisson,Inj_Def_Next_w,shock_x_blowout_top25_fullseason,-0.00867,0.046638,0.852531,0.991368,0.904765,1.08626,1,5950,20807.252381,22553.641618
1814,preferred_count_def,def,shock_top25_fullseason,poisson,Inj_Def_Next_w,vol_nonscore_roll4_prior,-0.010894,0.007554,0.149259,0.989165,0.974628,1.003919,1,5950,20807.252381,22553.641618
1293,preferred_count_def,def,shock_top25_prior,poisson,Inj_Def_Next_w,cum_shocks_nonscore_prior,0.006229,0.01484,0.674694,1.006248,0.977401,1.035946,1,5950,20806.521999,22552.911235
1290,preferred_count_def,def,shock_top25_prior,poisson,Inj_Def_Next_w,shock_nonscore_top25_prior,0.022908,0.027301,0.40143,1.023172,0.969861,1.079414,1,5950,20806.521999,22552.911235


We construct a weighted special teams workload from component counts, then refit the preferred models using weighted shock and volatility exposures when the components exist

In [10]:
MODEL_VIEW = "team_week_panel_nextweek_model"

# Confirm the model view (table or view in schema main) exists before reading its columns.
mv_exists = con.execute(f"""
SELECT COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{MODEL_VIEW}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()
n_model_views = int(mv_exists["n"].iloc[0])
if not n_model_views:
    raise RuntimeError(f"Missing {MODEL_VIEW}, rerun notebook 11 and rebuild model view")

mv_cols = {str(name) for name in con.execute(f"DESCRIBE {MODEL_VIEW}").df()["column_name"]}

# Component counts required for the weighted workload, and the subset actually available.
need_components = ["ST_Punt_w", "ST_PuntReturn_w", "ST_Kickoff_w", "ST_KickReturn_w", "ST_FGAtt_w", "ST_XPAtt_w"]
have_components = list(filter(mv_cols.__contains__, need_components))

# Column layouts of the two weighted sensitivity output tables.
coef_schema_cols = [
    "model", "side", "spec_variant", "family",
    "outcome", "term",
    "beta", "se_cluster", "pvalue",
    "irr", "irr_ci_lo", "irr_ci_hi",
    "is_key_term",
    "nobs", "aic", "bic",
]
meta_schema_cols = [
    "side", "spec_variant", "family", "formula",
    "nobs", "aic", "bic",
    "fit_ok", "fit_error",
]

if len(have_components) != len(need_components):
    # Without every component the weighted workload cannot be built; emit empty, typed outputs.
    print("missing weighted components in model view, skipping weighted sensitivity", sorted(list(set(need_components) - set(have_components))))
    weighted_coef = pd.DataFrame(columns=coef_schema_cols)
    weighted_meta = pd.DataFrame(columns=meta_schema_cols)
else:
    # Attach the component counts to the step 21 frame on season, week and team.
    join_keys = [SEASON_COL, WEEK_COL, TEAM_COL]
    sel_cols = ", ".join(join_keys + need_components)
    comp_df = con.execute(f"SELECT {sel_cols} FROM {MODEL_VIEW}").df()
    comp_df[TEAM_COL] = comp_df[TEAM_COL].astype(str)
    comp_df[SEASON_COL] = comp_df[SEASON_COL].astype(int)
    comp_df[WEEK_COL] = comp_df[WEEK_COL].astype(int)

    base = df21.copy()
    base = base.merge(comp_df, on=join_keys, how="left")

    # Rows without a match in the model view contribute zero to every component.
    for c in need_components:
        base[c] = base[c].fillna(0).astype(float)

    # Weighted workload: 1.5 for punts and both return types, 1.0 for kickoffs,
    # 0.5 for field goal attempts, 0.25 for extra point attempts.
    base["st_load_weighted"] = (
        1.5 * (base["ST_Punt_w"] + base["ST_PuntReturn_w"] + base["ST_KickReturn_w"])
        + 1.0 * base["ST_Kickoff_w"]
        + 0.5 * base["ST_FGAtt_w"]
        + 0.25 * base["ST_XPAtt_w"]
    ).astype(float)

    base = base.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)
    gg = base.groupby([TEAM_COL, SEASON_COL], sort=False)

    # Season-to-date mean and sd of the weighted load, shifted so only prior weeks are used.
    mean_w_prior = (
        gg["st_load_weighted"].apply(lambda s: s.expanding().mean().shift(1)).reset_index(level=[0, 1], drop=True)
    )
    sd_w_prior = (
        gg["st_load_weighted"].apply(lambda s: s.expanding().std(ddof=1).shift(1)).reset_index(level=[0, 1], drop=True)
    )
    z_w_prior = (base["st_load_weighted"] - mean_w_prior) / sd_w_prior

    base["shock_weighted"] = (z_w_prior >= 1).fillna(False).astype(int)
    base["shock_x_blowout_weighted"] = (base["shock_weighted"] * base["blowout_flag_w"].fillna(0).astype(int)).astype(int)

    base["vol_weighted_s2d_prior"] = sd_w_prior.fillna(0.0).astype(float)
    base["vol_weighted_roll4_prior"] = (
        gg["st_load_weighted"]
        .apply(lambda s: s.shift(1).rolling(4, min_periods=2).std(ddof=1))
        .reset_index(level=[0, 1], drop=True)
        .fillna(base["vol_weighted_s2d_prior"])
        .astype(float)
    )

    # Shocks accumulated in earlier weeks of the same team-season. The running
    # total minus the current week's flag keeps the count inside each group;
    # shifting the ungrouped cumulative series would carry the previous
    # team-season's total into the first week of the next one.
    shocks_to_date = base.groupby([TEAM_COL, SEASON_COL], sort=False)["shock_weighted"].cumsum()
    base["cum_shocks_weighted_prior"] = (shocks_to_date - base["shock_weighted"]).astype(int)

    vol_term_def = base_vol_def
    vol_term_off = base_vol_off

    # Swap every baseline exposure term for its weighted counterpart.
    rename_weighted_def = {
        base_shock_def: "shock_weighted",
        base_inter_def: "shock_x_blowout_weighted",
        vol_term_def: "vol_weighted_roll4_prior",
        base_cum_def: "cum_shocks_weighted_prior",
    }
    rename_weighted_off = {
        base_shock_off: "shock_weighted",
        base_inter_off: "shock_x_blowout_weighted",
        vol_term_off: "vol_weighted_roll4_prior",
        base_cum_off: "cum_shocks_weighted_prior",
    }

    f_def_w = _rewrite_formula(base_formula_def, rename_weighted_def)
    f_off_w = _rewrite_formula(base_formula_off, rename_weighted_off)

    key_terms_w = [
        "shock_weighted",
        "shock_x_blowout_weighted",
        "vol_weighted_roll4_prior",
        "cum_shocks_weighted_prior",
    ]

    coef_rows = []
    meta_rows = []

    for side_w, formula_w, family_w in (
        ("def", f_def_w, base_family_def),
        ("off", f_off_w, base_family_off),
    ):
        # Start from a failure record; it is upgraded only after a successful fit and tidy.
        meta_w = {
            "side": side_w,
            "spec_variant": "weighted_components",
            "family": family_w,
            "formula": formula_w,
            "nobs": np.nan,
            "aic": np.nan,
            "bic": np.nan,
            "fit_ok": 0,
            "fit_error": "",
        }
        try:
            res_w = _fit_count(formula_w, family_w, base, TEAM_COL)
            coef_rows.append(_tidy_count(res_w, f"weighted_count_{side_w}", side_w, "weighted_components", family_w, key_terms_w))
            meta_w.update(
                nobs=int(getattr(res_w, "nobs", np.nan)),
                aic=float(getattr(res_w, "aic", np.nan)),
                bic=float(getattr(res_w, "bic", np.nan)),
                fit_ok=1,
            )
        except Exception as e:
            # A failed side is recorded in the meta table instead of stopping the run.
            meta_w["fit_error"] = str(e)
        meta_rows.append(pd.DataFrame([meta_w]))

    weighted_coef = pd.concat(coef_rows, ignore_index=True) if coef_rows else pd.DataFrame(columns=coef_schema_cols)
    weighted_meta = pd.concat(meta_rows, ignore_index=True) if meta_rows else pd.DataFrame(columns=meta_schema_cols)

    # Enforce the declared column set and order; absent columns are added empty.
    for c in coef_schema_cols:
        if c not in weighted_coef.columns:
            weighted_coef[c] = pd.Series(dtype="object")
    weighted_coef = weighted_coef[coef_schema_cols]

    for c in meta_schema_cols:
        if c not in weighted_meta.columns:
            weighted_meta[c] = pd.Series(dtype="object")
    weighted_meta = weighted_meta[meta_schema_cols]

# A frame with no columns is swapped for an empty frame carrying the declared schema.
if not weighted_coef.shape[1]:
    weighted_coef = pd.DataFrame(columns=coef_schema_cols)
if not weighted_meta.shape[1]:
    weighted_meta = pd.DataFrame(columns=meta_schema_cols)

# (temporary view name, destination table, frame) for each weighted output.
weighted_outputs = (
    ("step21_weighted_coef_tmp", "step21_weighted_sensitivity_coefficients", weighted_coef),
    ("step21_weighted_meta_tmp", "step21_weighted_sensitivity_meta", weighted_meta),
)

# Materialize both tables in DuckDB first, then write the CSV copies.
for tmp_view, table_name, frame in weighted_outputs:
    con.register(tmp_view, frame)
    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM {tmp_view}")
    con.unregister(tmp_view)

for tmp_view, table_name, frame in weighted_outputs:
    frame.to_csv(out_dir / f"{table_name}.csv", index=False)

for tmp_view, table_name, frame in weighted_outputs:
    print("wrote duckdb table " + table_name)

missing weighted components in model view, skipping weighted sensitivity ['ST_FGAtt_w', 'ST_XPAtt_w']
wrote duckdb table step21_weighted_sensitivity_coefficients
wrote duckdb table step21_weighted_sensitivity_meta


We run timing validation and placebo regressions using matched right-hand sides with lead-based outcomes or lead-based exposures

In [12]:
def _safe_exp(x):
    """Exponentiate ``x`` as a float array after clipping it to [-50, 50]."""
    clipped = np.clip(np.asarray(x, dtype=float), -50.0, 50.0)
    return np.exp(clipped)

def _tidy_count(res, model_name: str, side: str, spec_variant: str, family: str, key_terms: list[str]) -> pd.DataFrame:
    """Flatten a fitted count model into one row per coefficient.

    Reads ``params``, ``bse`` and ``pvalues`` from ``res`` as floats and labels
    every row with the given model, side, variant and family. The ``irr``
    columns come from ``_safe_exp`` applied to beta and to beta -/+ 1.96 * se.
    ``is_key_term`` is 1 for terms listed in ``key_terms`` (None entries are
    ignored) and 0 otherwise. ``nobs``, ``aic`` and ``bic`` are copied from
    ``res`` onto every row.
    """
    coef = res.params.astype(float)
    beta = coef.values.astype(float)
    se = res.bse.astype(float).values.astype(float)
    pval = res.pvalues.astype(float).values.astype(float)
    half_width = 1.96 * se

    out = pd.DataFrame({
        "model": str(model_name),
        "side": str(side),
        "spec_variant": str(spec_variant),
        "family": str(family),
        "outcome": str(getattr(res.model, "endog_names", "")),
        "term": coef.index.astype(str),
        "beta": beta,
        "se_cluster": se,
        "pvalue": pval,
        "irr": _safe_exp(beta),
        "irr_ci_lo": _safe_exp(beta - half_width),
        "irr_ci_hi": _safe_exp(beta + half_width),
    })

    wanted = {term for term in key_terms if term is not None}
    out["is_key_term"] = [1 if term in wanted else 0 for term in out["term"]]

    # Model-level statistics are repeated on every coefficient row.
    return out.assign(
        nobs=int(getattr(res, "nobs", np.nan)),
        aic=float(getattr(res, "aic", np.nan)),
        bic=float(getattr(res, "bic", np.nan)),
    )

def _replace_outcome(formula: str, new_outcome: str) -> str:
    """Return the formula with its left-hand side replaced by ``new_outcome``.

    The right-hand side is everything after the first ``~``, stripped.
    Raises IndexError when the formula contains no ``~``.
    """
    pieces = str(formula).split("~", 1)
    right_side = pieces[1].strip()
    return "{} ~ {}".format(new_outcome, right_side)

def _drop_injury_controls(rhs_terms: list[str]) -> list[str]:
    """Return the whitespace-stripped terms, omitting any that start with ``Inj_``."""
    stripped_terms = (str(term).strip() for term in rhs_terms)
    return [term for term in stripped_terms if not term.startswith("Inj_")]

# Take the baseline right-hand sides and strip every Inj_* term before
# reusing them for the shifted-outcome models below.
rhs_def = _get_rhs_terms(base_formula_def)
rhs_off = _get_rhs_terms(base_formula_off)
rhs_def_timing = _drop_injury_controls(rhs_def)
rhs_off_timing = _drop_injury_controls(rhs_off)

_rhs_def_joined = " + ".join(rhs_def_timing)
_rhs_off_joined = " + ".join(rhs_off_timing)

# One formula per shifted outcome (next, previous, next2) and side; each side
# shares a single right-hand side across its three formulas.
timing_formula_def_next = f"Inj_Def_Next_w ~ {_rhs_def_joined}"
timing_formula_def_prev = f"Inj_Def_Prev_w ~ {_rhs_def_joined}"
timing_formula_def_next2 = f"Inj_Def_Next2_w ~ {_rhs_def_joined}"

timing_formula_off_next = f"Inj_Off_Next_w ~ {_rhs_off_joined}"
timing_formula_off_prev = f"Inj_Off_Prev_w ~ {_rhs_off_joined}"
timing_formula_off_next2 = f"Inj_Off_Next2_w ~ {_rhs_off_joined}"

# Accumulators: tidy coefficient frames for successful fits, and one-row
# failure records for fits that raised.
timing_rows, placebo_rows = [], []

def _fit_and_tidy_one(formula: str, family: str, side: str, tag: str, key_terms: list[str]):
    """Fit one timing formula on ``df21`` and return its tidy coefficient frame.

    Rows whose outcome column (as parsed from ``formula``) is missing are
    dropped before fitting. The result is labelled ``timing_<side>`` with
    ``tag`` as its spec variant.
    """
    outcome_col = _get_outcome(formula)
    fit_frame = df21.dropna(subset=[outcome_col]).copy()
    fitted = _fit_count(formula, family, fit_frame, TEAM_COL)
    return _tidy_count(fitted, f"timing_{side}", side, tag, family, key_terms)

# Exposure terms whose coefficients get flagged via is_key_term; unset (None)
# entries are dropped.
key_terms_def = [t for t in (base_shock_def, base_inter_def, base_vol_def, base_cum_def) if t is not None]
key_terms_off = [t for t in (base_shock_off, base_inter_off, base_vol_off, base_cum_off) if t is not None]

# (side, test tag, formula, family, key terms) for every shifted-outcome fit.
# The tag doubles as the spec_variant on success and the test name on failure.
_timing_specs = [
    ("def", "timing_next_w", timing_formula_def_next, base_family_def, key_terms_def),
    ("def", "timing_prev_w", timing_formula_def_prev, base_family_def, key_terms_def),
    ("def", "placebo_next2_w", timing_formula_def_next2, base_family_def, key_terms_def),
    ("off", "timing_next_w", timing_formula_off_next, base_family_off, key_terms_off),
    ("off", "timing_prev_w", timing_formula_off_prev, base_family_off, key_terms_off),
    ("off", "placebo_next2_w", timing_formula_off_next2, base_family_off, key_terms_off),
]

for _side, _test, _formula, _family, _key_terms in _timing_specs:
    try:
        _tidy = _fit_and_tidy_one(_formula, _family, _side, _test, _key_terms)
    except Exception as e:
        # Best effort: a failed fit must not stop the remaining ones, so the
        # error is recorded as a meta row instead of propagating.
        placebo_rows.append(pd.DataFrame([{"side": _side, "test": _test, "fit_ok": 0, "fit_error": str(e)}]))
    else:
        timing_rows.append(_tidy)

def _future_exposure_placebo(base_formula: str, family: str, side: str, outcome: str, exp_terms: dict):
    """Fit the placebo that swaps exposure terms for their ``_lead1`` columns.

    Parameters
    ----------
    base_formula
        Formula whose right-hand side is reused; its outcome is replaced by
        ``outcome``.
    family
        Model family passed through to ``_fit_count``.
    side
        Label used in the model name ``placebo_futureexp_<side>``.
    outcome
        Outcome column; rows of ``df21`` where it is missing are dropped.
    exp_terms
        Mapping with optional keys ``shock``, ``inter``, ``vol``, ``cum`` giving
        the exposure term names. A term is swapped only when
        ``<term>_lead1`` exists in ``df21``.

    Returns
    -------
    pd.DataFrame
        Tidy coefficient frame with the lead columns flagged as key terms,
        plus ``placebo_outcome`` and ``placebo_formula`` columns.

    Raises
    ------
    ValueError
        If no exposure term has a ``_lead1`` column. Without any swap the fit
        would be the unmodified model stored under a placebo label.
    """
    formula = _replace_outcome(base_formula, outcome)

    rename = {}
    for slot in ("shock", "inter", "vol", "cum"):
        term = exp_terms.get(slot)
        if term is None:
            continue
        lead_col = f"{term}_lead1"
        if lead_col in df21.columns:
            rename[term] = lead_col

    if not rename:
        raise ValueError(
            f"no _lead1 exposure columns found in df21 for side {side}, "
            "future exposure placebo would refit the unmodified model"
        )

    f_lead = _rewrite_formula(formula, rename)

    key_terms = list(rename.values())
    dat = df21.dropna(subset=[outcome]).copy()

    r = _fit_count(f_lead, family, dat, TEAM_COL)
    t = _tidy_count(r, f"placebo_futureexp_{side}", side, "current_on_futureexp_lead1", family, key_terms)
    t["placebo_outcome"] = outcome
    t["placebo_formula"] = f_lead
    return t

# Exposure term names per side, keyed by the slots _future_exposure_placebo looks up.
exp_terms_def = dict(shock=base_shock_def, inter=base_inter_def, vol=base_vol_def, cum=base_cum_def)
exp_terms_off = dict(shock=base_shock_off, inter=base_inter_off, vol=base_vol_off, cum=base_cum_off)

# Each placebo is best effort: a failure becomes a meta row, a success is
# appended to the coefficient rows.
try:
    pdef = _future_exposure_placebo(base_formula_def, base_family_def, "def", "Inj_Def_Next_w", exp_terms_def)
except Exception as e:
    _failure = {"side": "def", "test": "current_on_futureexp_lead1", "fit_ok": 0, "fit_error": str(e)}
    placebo_rows.append(pd.DataFrame([_failure]))
else:
    timing_rows.append(pdef)

try:
    poff = _future_exposure_placebo(base_formula_off, base_family_off, "off", "Inj_Off_Next_w", exp_terms_off)
except Exception as e:
    _failure = {"side": "off", "test": "current_on_futureexp_lead1", "fit_ok": 0, "fit_error": str(e)}
    placebo_rows.append(pd.DataFrame([_failure]))
else:
    timing_rows.append(poff)

# Fixed column layouts so both output tables keep the same schema even when
# no fit contributed rows.
coef_schema_cols = [
    "model", "side", "spec_variant", "family", "outcome", "term", "beta", "se_cluster", "pvalue",
    "irr", "irr_ci_lo", "irr_ci_hi", "is_key_term", "nobs", "aic", "bic", "placebo_outcome", "placebo_formula",
]
meta_schema_cols = ["side", "test", "fit_ok", "fit_error"]

if timing_rows:
    timing_coef = pd.concat(timing_rows, ignore_index=True)
else:
    timing_coef = pd.DataFrame(columns=coef_schema_cols)

if placebo_rows:
    placebo_meta = pd.concat(placebo_rows, ignore_index=True)
else:
    placebo_meta = pd.DataFrame(columns=meta_schema_cols)

# Columns absent after concatenation are added as placeholders (NaN for
# coefficients, "" for meta), then each frame is put in schema order.
for _col in [c for c in coef_schema_cols if c not in timing_coef.columns]:
    timing_coef[_col] = np.nan
timing_coef = timing_coef[coef_schema_cols]

for _col in [c for c in meta_schema_cols if c not in placebo_meta.columns]:
    placebo_meta[_col] = ""
placebo_meta = placebo_meta[meta_schema_cols]

# (table name, temporary view name, frame) for each persisted output.
_timing_outputs = [
    ("step21_timing_placebo_coefficients", "step21_timing_coef_tmp", timing_coef),
    ("step21_timing_placebo_meta", "step21_placebo_meta_tmp", placebo_meta),
]

# DuckDB tables first: expose the frame as a temporary view, materialise it, drop the view.
for _table, _tmp_view, _frame in _timing_outputs:
    con.register(_tmp_view, _frame)
    con.execute(f"CREATE OR REPLACE TABLE {_table} AS SELECT * FROM {_tmp_view}")
    con.unregister(_tmp_view)

# Then a CSV copy of each table under out_dir.
for _table, _tmp_view, _frame in _timing_outputs:
    _frame.to_csv(out_dir / f"{_table}.csv", index=False)

for _table, _tmp_view, _frame in _timing_outputs:
    print(f"wrote duckdb table {_table}")

# Cell display: key-term rows only, ordered by side, variant and term.
(
    timing_coef
    .query("is_key_term == 1")
    .sort_values(["side", "spec_variant", "term"])
    .head(40)
)

wrote duckdb table step21_timing_placebo_coefficients
wrote duckdb table step21_timing_placebo_meta


Unnamed: 0,model,side,spec_variant,family,outcome,term,beta,se_cluster,pvalue,irr,irr_ci_lo,irr_ci_hi,is_key_term,nobs,aic,bic,placebo_outcome,placebo_formula
741,timing_def,def,placebo_next2_w,poisson,Inj_Def_Next2_w,cum_shocks_nonscore_prior,0.007628,0.016451,0.642864,1.007657,0.975685,1.040677,1,5534,19427.51321,21055.705086,,
738,timing_def,def,placebo_next2_w,poisson,Inj_Def_Next2_w,shock_nonscore,0.019475,0.030723,0.526157,1.019665,0.960077,1.082952,1,5534,19427.51321,21055.705086,,
739,timing_def,def,placebo_next2_w,poisson,Inj_Def_Next2_w,shock_x_blowout,0.055789,0.05569,0.316448,1.057375,0.948036,1.179323,1,5534,19427.51321,21055.705086,,
740,timing_def,def,placebo_next2_w,poisson,Inj_Def_Next2_w,vol_nonscore_roll4_prior,0.000306,0.008018,0.969528,1.000306,0.984708,1.016151,1,5534,19427.51321,21055.705086,,
249,timing_def,def,timing_next_w,poisson,Inj_Def_Next_w,cum_shocks_nonscore_prior,0.006588,0.015167,0.664027,1.00661,0.977126,1.036983,1,5950,20846.878483,22579.885426,,
246,timing_def,def,timing_next_w,poisson,Inj_Def_Next_w,shock_nonscore,0.021201,0.031752,0.504312,1.021428,0.959798,1.087014,1,5950,20846.878483,22579.885426,,
247,timing_def,def,timing_next_w,poisson,Inj_Def_Next_w,shock_x_blowout,0.043472,0.054863,0.428141,1.044431,0.937949,1.163002,1,5950,20846.878483,22579.885426,,
248,timing_def,def,timing_next_w,poisson,Inj_Def_Next_w,vol_nonscore_roll4_prior,-0.008165,0.007973,0.305807,0.991868,0.976488,1.00749,1,5950,20846.878483,22579.885426,,
495,timing_def,def,timing_prev_w,poisson,Inj_Def_Prev_w,cum_shocks_nonscore_prior,0.020779,0.017523,0.235692,1.020996,0.986526,1.056671,1,5534,18197.6236,19825.815476,,
492,timing_def,def,timing_prev_w,poisson,Inj_Def_Prev_w,shock_nonscore,-0.056487,0.037669,0.133724,0.945078,0.877816,1.017495,1,5534,18197.6236,19825.815476,,


A quick sanity check confirms that the baseline and robustness runs produced usable key-term rows for both sides, with minimal fit failures.

In [13]:
# Read the robustness meta and coefficient tables back from DuckDB.
rob_meta = con.execute("SELECT * FROM step21_robustness_meta").df()
rob_coef = con.execute("SELECT * FROM step21_robustness_coefficients").df()

# Any meta row whose fit_ok flag is not exactly 1 counts as a failed fit.
if "fit_ok" in rob_meta.columns:
    fail = rob_meta[rob_meta["fit_ok"] != 1]
else:
    fail = pd.DataFrame()
print("robustness fits", len(rob_meta), "failures", len(fail))
if len(fail):
    print(fail[["side", "spec_variant", "fit_error"]].head(20))

if "is_key_term" in rob_coef.columns:
    key = rob_coef[rob_coef["is_key_term"] == 1]
else:
    key = pd.DataFrame()
print("key term rows", len(key))

# The fallback frame above has no columns, so grouping it would raise
# KeyError; report the gap instead of crashing the check.
group_cols = ["side", "spec_variant"]
missing_group_cols = [c for c in group_cols if c not in key.columns]
if missing_group_cols:
    print("cannot tabulate key term rows, missing columns", missing_group_cols)
else:
    print(key.groupby(group_cols, dropna=False).size().reset_index(name="n").sort_values(group_cols).head(30))

robustness fits 16 failures 0
key term rows 64
   side            spec_variant  n
0   def                baseline  4
1   def  shock_top25_fullseason  4
2   def       shock_top25_prior  4
3   def               shock_z05  4
4   def            vol_cv_roll4  4
5   def              vol_cv_s2d  4
6   def          vol_roll4_only  4
7   def            vol_s2d_only  4
8   off                baseline  4
9   off  shock_top25_fullseason  4
10  off       shock_top25_prior  4
11  off               shock_z05  4
12  off            vol_cv_roll4  4
13  off              vol_cv_s2d  4
14  off          vol_roll4_only  4
15  off            vol_s2d_only  4
