We import the required Python packages and open a DuckDB connection that every later cell reuses.

In [5]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Best-effort switch of the GLM BIC calculation via statsmodels' module-level
# setter. Only the failures that mean "this statsmodels build does not offer
# the switch" are tolerated; any other error is unexpected and should surface
# instead of being silently discarded.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except (ImportError, AttributeError):
    pass

# Independently of the switch above, silence the FutureWarning about the
# deviance-based BIC so model fitting output stays readable.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

# Absolute working directory; the database search walks upward from here.
CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

# Two layouts are accepted, in priority order: <ancestor>/db/nflpa.duckdb and
# then <ancestor>/nflpa.duckdb. Every ancestor (starting with CWD itself) is
# tried for the first layout before the second layout is considered, and the
# first existing path wins.
for rel in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    for p in (CWD, *CWD.parents):
        cand = p / rel
        if cand.exists():
            REPO_ROOT, DB_FILE = p, cand
            break
    if DB_FILE is not None:
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

# Shared connection for the rest of the notebook, opened read-write.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Tables this step reads; both must already exist in the database.
need_tables = ["step18_model_frame", "step18_preferred_model_specs"]

table_names = con.execute("SHOW TABLES").df()["name"]
existing = {str(name) for name in table_names}
missing = [t for t in need_tables if t not in existing]

print("connected db", str(DB_FILE))
print("missing step 21 inputs", missing)
if missing:
    # Fail fast: nothing below can run without the step 18 outputs.
    raise RuntimeError("Missing step 21 inputs, rerun step 18 first")


connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
missing step 21 inputs []


We load the model frame and preferred specs, then standardize key column names and verify that the required outcomes are present.

In [2]:
# Pull both step 18 tables into pandas.
df = con.execute("SELECT * FROM step18_model_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

# Team identifier: prefer "team", fall back to "team_key".
TEAM_COL = next((name for name in ("team", "team_key") if name in df.columns), None)
if TEAM_COL is None:
    raise RuntimeError("Missing team identifier column in step18_model_frame")

SEASON_COL = "season"
WEEK_COL = "week"

# Both next-week injury counts must be present before anything is derived.
req_outcomes = ["Inj_Off_Next_w", "Inj_Def_Next_w"]
for c in req_outcomes:
    if c not in df.columns:
        raise RuntimeError(f"Missing {c} in step18_model_frame, rerun step 16 and step 18")

# Integer key season * 100 + week, built only when the frame lacks it.
if "season_week" not in df.columns:
    season_part = df[SEASON_COL].astype(int) * 100
    df["season_week"] = (season_part + df[WEEK_COL].astype(int)).astype(int)

# Exposure inputs that cannot be rebuilt here; each carries its own rerun hint.
required_inputs = {
    "load_nonscore": "Missing load_nonscore in step18_model_frame, rerun step 18",
    "shock_nonscore": "Missing shock_nonscore in step18_model_frame, rerun step 18",
    "blowout_flag_w": "Missing blowout_flag_w in step18_model_frame, rerun step 10 onward and rebuild step 16 then step 18",
}
for col, message in required_inputs.items():
    if col not in df.columns:
        raise RuntimeError(message)

# Interaction of the shock flag with the blowout flag (missing blowout -> 0).
if "shock_x_blowout" not in df.columns:
    blowout = df["blowout_flag_w"].fillna(0).astype(int)
    df["shock_x_blowout"] = (df["shock_nonscore"].astype(int) * blowout).astype(int)

# Binary "any injury next week" indicators derived from the count outcomes.
any_injury_sources = (
    ("Any_Off_Injury_Next_w", "Inj_Off_Next_w"),
    ("Any_Def_Injury_Next_w", "Inj_Def_Next_w"),
)
for flag_col, count_col in any_injury_sources:
    if flag_col not in df.columns:
        df[flag_col] = (df[count_col].astype(float) > 0).astype(int)

print("rows step18_model_frame", len(df))
print("team col", TEAM_COL)
print("preferred specs rows", len(pref))
print(pref[["side", "outcome", "family", "spec_id"]])

rows step18_model_frame 5950
team col team
preferred specs rows 2
  side         outcome   family                 spec_id
0  def  Inj_Def_Next_w  poisson  nonscore_roll4_no_lags
1  off  Inj_Off_Next_w  poisson  nonscore_roll4_no_lags


We construct all robustness variants in a no-lookahead-safe way where possible, and then create the lead and lag variables needed for timing and placebo tests.

In [3]:
# Work on a copy with the grouping keys coerced to fixed dtypes.
df = df.copy()
df[TEAM_COL] = df[TEAM_COL].astype(str)
df[SEASON_COL] = df[SEASON_COL].astype(int)
df[WEEK_COL] = df[WEEK_COL].astype(int)

# Rows must be in week order within each team-season: every expanding,
# rolling and shift operation on the groups relies on that ordering.
df = df.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)
g = df.groupby([TEAM_COL, SEASON_COL], sort=False)

# Per-group statistics go through transform(), whose result is always aligned
# to df's own index. apply() + reset_index(level=[0, 1]) only works when the
# installed pandas prepends the group keys to the result index.
load_by_group = g["load_nonscore"]
blowout = df["blowout_flag_w"].fillna(0).astype(int)

# Season-to-date mean and sample SD, shifted one row so each value only
# reflects rows before the current one within the team-season.
mean_s2d_prior = load_by_group.transform(lambda s: s.expanding().mean().shift(1))
sd_s2d_prior = load_by_group.transform(lambda s: s.expanding().std(ddof=1).shift(1))

z_s2d_prior = (df["load_nonscore"] - mean_s2d_prior) / sd_s2d_prior

# Shock variant: load at least 0.5 prior-SD above the prior mean. Rows whose
# z-score is undefined (not enough prior rows) are flagged 0.
df["shock_nonscore_z05"] = (z_s2d_prior >= 0.5).fillna(False).astype(int)
df["shock_x_blowout_z05"] = (df["shock_nonscore_z05"] * blowout).astype(int)

# Shock variant: top quartile of the team-season. The threshold is computed
# from ALL rows of the team-season, so this variant does look ahead.
q75_full = load_by_group.transform(lambda s: float(s.quantile(0.75)) if len(s) else np.nan)
df["shock_nonscore_top25_fullseason"] = (df["load_nonscore"] >= q75_full).fillna(False).astype(int)
df["shock_x_blowout_top25_fullseason"] = (df["shock_nonscore_top25_fullseason"] * blowout).astype(int)

# Shock variant: top quartile relative to prior rows only (expanding 75th
# percentile shifted by one row).
q75_prior = load_by_group.transform(lambda s: s.expanding().quantile(0.75).shift(1))
df["shock_nonscore_top25_prior"] = (df["load_nonscore"] >= q75_prior).fillna(False).astype(int)
df["shock_x_blowout_top25_prior"] = (df["shock_nonscore_top25_prior"] * blowout).astype(int)

if "vol_nonscore_s2d_prior" not in df.columns or "vol_nonscore_roll4_prior" not in df.columns:
    raise RuntimeError("Missing vol_nonscore_s2d_prior or vol_nonscore_roll4_prior in step18_model_frame, rerun step 18")

# Coefficient of variation = prior volatility / prior mean. Infinite or
# undefined ratios (zero or missing mean) are stored as 0.0.
df["cv_nonscore_s2d_prior"] = (df["vol_nonscore_s2d_prior"].astype(float) / mean_s2d_prior.astype(float)).replace([np.inf, -np.inf], np.nan).fillna(0.0).astype(float)

# Mean of the previous four rows (at least two required); where that window
# is not yet available, fall back to the season-to-date prior mean.
mean_roll4_prior = load_by_group.transform(
    lambda s: s.shift(1).rolling(4, min_periods=2).mean()
).fillna(mean_s2d_prior)

df["cv_nonscore_roll4_prior"] = (df["vol_nonscore_roll4_prior"].astype(float) / mean_roll4_prior.astype(float)).replace([np.inf, -np.inf], np.nan).fillna(0.0).astype(float)

# Outcome one row further ahead within the team-season (NaN on the last row).
lead_outcomes = (
    ("Inj_Off_Next2_w", "Inj_Off_Next_w"),
    ("Inj_Def_Next2_w", "Inj_Def_Next_w"),
)
for lead_col, base_col in lead_outcomes:
    df[lead_col] = g[base_col].shift(-1)

# Lagged outcome: shift the "Last" column when the frame has one, otherwise
# shift the "Next" outcome. Either way the first row of a team-season is NaN.
prev_outcomes = (
    ("Inj_Off_Prev_w", "Inj_Off_Last_w", "Inj_Off_Next_w"),
    ("Inj_Def_Prev_w", "Inj_Def_Last_w", "Inj_Def_Next_w"),
)
for prev_col, last_col, next_col in prev_outcomes:
    source_col = last_col if last_col in df.columns else next_col
    df[prev_col] = g[source_col].shift(1)

# One- and two-row leads of each exposure that exists in the frame.
lead_exposure_cols = [
    "shock_nonscore",
    "shock_x_blowout",
    "vol_nonscore_s2d_prior",
    "vol_nonscore_roll4_prior",
    "cum_shocks_nonscore_prior",
]
for c in lead_exposure_cols:
    if c not in df.columns:
        continue
    for step in (1, 2):
        df[f"{c}_lead{step}"] = g[c].shift(-step)

# Materialise the frame as a DuckDB table. The DataFrame is exposed to SQL
# under a temporary name; the registration is always removed, even when the
# CREATE statement fails, so a failed run does not leave it behind on the
# shared connection.
con.register("step21_frame_tmp", df)
try:
    con.execute("CREATE OR REPLACE TABLE step21_frame AS SELECT * FROM step21_frame_tmp")
finally:
    con.unregister("step21_frame_tmp")

# CSV copy of the same frame; the path is relative to the working directory.
out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(out_dir / "step21_frame.csv", index=False)

print("wrote duckdb table step21_frame")
print("wrote csv outputs step21_frame.csv")

wrote duckdb table step21_frame
wrote csv outputs step21_frame.csv


We run a quick sanity check to confirm that the new robustness fields exist and have plausible ranges, and that the lead outcomes are missing only at season edges.

In [4]:
# Derived columns that must exist before any model is fitted.
check_cols = [
    "shock_nonscore_z05",
    "shock_nonscore_top25_fullseason",
    "shock_nonscore_top25_prior",
    "cv_nonscore_s2d_prior",
    "cv_nonscore_roll4_prior",
    "Inj_Off_Next2_w",
    "Inj_Def_Next2_w",
    "Inj_Off_Prev_w",
    "Inj_Def_Prev_w",
]
missing = [c for c in check_cols if c not in df.columns]
print("missing check cols", missing)
if missing:
    raise RuntimeError("Missing step 21 derived columns, rerun step 21 cell 3")

# One summary row per column: null count plus NaN-ignoring min / max / mean.
summary_rows = []
for col in check_cols:
    as_float = df[col].astype(float)
    summary_rows.append({
        "col": col,
        "nulls": int(df[col].isna().sum()),
        "min": float(np.nanmin(as_float)),
        "max": float(np.nanmax(as_float)),
        "mean": float(np.nanmean(as_float)),
    })
summary = pd.DataFrame(summary_rows, columns=["col", "nulls", "min", "max", "mean"])
print(summary)

# Missing lead outcomes per team-season, summed over both lead columns.
lead_cols = ["Inj_Off_Next2_w", "Inj_Def_Next2_w"]
by_team_season = df.groupby([SEASON_COL, TEAM_COL], sort=False)[lead_cols]
edge_missing = by_team_season.apply(lambda x: int(x.isna().sum().sum()))
print("total lead2 missing per team season, expected small and concentrated at end weeks")
print(edge_missing.describe())

missing check cols []
                               col  nulls  min        max      mean
0               shock_nonscore_z05      0  0.0   1.000000  0.272269
1  shock_nonscore_top25_fullseason      0  0.0   1.000000  0.355126
2       shock_nonscore_top25_prior      0  0.0   1.000000  0.365882
3            cv_nonscore_s2d_prior      0  0.0   0.633958  0.168143
4          cv_nonscore_roll4_prior      0  0.0   0.633958  0.164322
5                  Inj_Off_Next2_w    416  0.0   8.000000  1.927720
6                  Inj_Def_Next2_w    416  0.0  10.000000  2.088182
7                   Inj_Off_Prev_w    416  0.0   9.000000  1.637152
8                   Inj_Def_Prev_w    416  0.0  10.000000  1.780990
total lead2 missing per team season, expected small and concentrated at end weeks
count    416.0
mean       2.0
std        0.0
min        2.0
25%        2.0
50%        2.0
75%        2.0
max        2.0
dtype: float64


We define reusable helpers to rewrite formulas, fit count models with clustered standard errors, and extract tidy coefficient outputs for baseline and robustness variants

In [7]:
def _rewrite_formula(formula: str, rename_map: dict) -> str:
    out = str(formula)
    for old, new in rename_map.items():
        if old == new:
            continue
        pat = rf"(?<![A-Za-z0-9_]){re.escape(old)}(?![A-Za-z0-9_])"
        out = re.sub(pat, new, out)
    return out

def _get_outcome(formula: str) -> str:
    return str(formula).split("~", 1)[0].strip()

def _get_rhs_terms(formula: str) -> list[str]:
    rhs = str(formula).split("~", 1)[1]
    parts = [p.strip() for p in rhs.split("+")]
    return [p for p in parts if p]

def _fit_count(formula: str, family: str, data: pd.DataFrame, cluster_col: str):
    fam = str(family).lower()
    if fam == "poisson":
        m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
        r = m.fit(cov_type="cluster", cov_kwds={"groups": data[cluster_col]})
        return r
    if fam in ["negative_binomial", "negativebinomial", "nb"]:
        m = smf.negativebinomial(formula=formula, data=data)
        base = m.fit(disp=False, maxiter=200)
        r = base.get_robustcov_results(cov_type="cluster", groups=data[cluster_col])
        return r
    raise RuntimeError(f"Unknown family {family}")

def _tidy_count(res, model_name: str, side: str, spec_variant: str, family: str, key_terms: list[str]) -> pd.DataFrame:
    params = res.params.copy()
    bse = res.bse.copy()
    pvals = res.pvalues.copy()

    out = pd.DataFrame({
        "model": str(model_name),
        "side": str(side),
        "spec_variant": str(spec_variant),
        "family": str(family),
        "outcome": str(getattr(res.model, "endog_names", "")),
        "term": params.index.astype(str),
        "beta": params.values.astype(float),
        "se_cluster": bse.values.astype(float),
        "pvalue": pvals.values.astype(float),
    })

    out["irr"] = np.exp(out["beta"].astype(float))
    out["irr_ci_lo"] = np.exp(out["beta"].astype(float) - 1.96 * out["se_cluster"].astype(float))
    out["irr_ci_hi"] = np.exp(out["beta"].astype(float) + 1.96 * out["se_cluster"].astype(float))

    keep = set(key_terms)
    out["is_key_term"] = out["term"].apply(lambda x: 1 if x in keep else 0)

    out["nobs"] = int(getattr(res, "nobs", np.nan))
    out["aic"] = float(getattr(res, "aic", np.nan))
    out["bic"] = float(getattr(res, "bic", np.nan))
    return out

def _infer_term_present(formula: str, term: str) -> bool:
    pat = rf"(?<![A-Za-z0-9_]){re.escape(term)}(?![A-Za-z0-9_])"
    return re.search(pat, str(formula)) is not None