We initialize Python imports and open a DuckDB connection that every later cell reuses.

In [2]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Suppress FutureWarnings whose message starts with the BIC deviance-formula
# notice so model fitting does not flood the cell output.
warnings.filterwarnings(
    action="ignore",
    message="The bic value is computed using the deviance formula.*",
    category=FutureWarning,
)

CWD = Path().resolve()

# Locate the database by walking from the working directory up to the
# filesystem root. The db/nflpa.duckdb layout is tried at every level first;
# a bare nflpa.duckdb is only accepted when that layout is found nowhere.
REPO_ROOT = None
DB_FILE = None
search_dirs = [CWD, *CWD.parents]

for rel_path in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    hit = next((folder for folder in search_dirs if (folder / rel_path).exists()), None)
    if hit is not None:
        REPO_ROOT, DB_FILE = hit, hit / rel_path
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

# Open the shared connection read-write: this notebook later creates tables
# through the same handle.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Fail early, with a pointer to the upstream steps, if an input table is absent.
need_tables = ["step21_frame", "step18_preferred_model_specs"]
existing = set(con.execute("SHOW TABLES").df()["name"].astype(str).tolist())
missing = [t for t in need_tables if t not in existing]

print("connected db", str(DB_FILE))
print("missing step 22 inputs", missing)
if missing:
    raise RuntimeError("Missing step 22 inputs, rerun step 21 and step 18 first")

df = con.execute("SELECT * FROM step21_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

# The team identifier may be stored under either name; prefer "team".
TEAM_COL = next((c for c in ("team", "team_key") if c in df.columns), None)
if TEAM_COL is None:
    raise RuntimeError("Missing team identifier column in step21_frame")

# Validate the spec table before indexing into it so a malformed step 18
# output produces an actionable message instead of a bare KeyError/IndexError.
missing_pref_cols = [c for c in ("side", "family", "formula") if c not in pref.columns]
if missing_pref_cols:
    raise RuntimeError(
        f"step18_preferred_model_specs is missing columns {missing_pref_cols}, rerun step 18"
    )

pref_side = pref["side"].astype(str)
pref_by_side = {}
for side_name in ("def", "off"):
    side_rows = pref[pref_side == side_name]
    if side_rows.empty:
        raise RuntimeError(
            f"No preferred model spec for side '{side_name}' in step18_preferred_model_specs, rerun step 18"
        )
    # If several rows exist for a side, the first one is used (as before).
    pref_by_side[side_name] = side_rows.iloc[0]

pref_def = pref_by_side["def"]
pref_off = pref_by_side["off"]

print("preferred def family", pref_def["family"])
print("preferred off family", pref_off["family"])
print("preferred def formula", pref_def["formula"])
print("preferred off formula", pref_off["formula"])

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
missing step 22 inputs []
preferred def family poisson
preferred off family poisson
preferred def formula Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + vol_nonscore_roll4_prior + cum_shocks_nonscore_prior + short_week_flag_w + bye_last_week_flag_w + home_flag_w + blowout_flag_w + points_for + points_against + offensive_snaps_w + defensive_snaps_w + Inj_Off_Last_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + C(team) + C(season_week)
preferred off formula Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + vol_nonscore_roll4_prior + cum_shocks_nonscore_prior + short_week_flag_w + bye_last_week_flag_w + home_flag_w + blowout_flag_w + points_for + points_against + offensive_snaps_w + defensive_snaps_w + Inj_Off_Last_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + C(team) + C(season_week)


We refit the preferred count models on the full sample so we have model objects available for expected injury changes and league scaling

In [3]:
def _fit_count(formula: str, family: str, data: pd.DataFrame, cluster_col: str):
    fam = str(family).lower()
    if fam == "poisson":
        m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
        r = m.fit(cov_type="cluster", cov_kwds={"groups": data[cluster_col]})
        return r
    if fam in ["negative_binomial", "negativebinomial", "nb"]:
        m = smf.negativebinomial(formula=formula, data=data)
        base = m.fit(disp=False, maxiter=200)
        r = base.get_robustcov_results(cov_type="cluster", groups=data[cluster_col])
        return r
    raise RuntimeError(f"Unknown family {family}")

def _infer_term_present(formula: str, term: str) -> bool:
    pat = rf"(?<![A-Za-z0-9_]){re.escape(term)}(?![A-Za-z0-9_])"
    return re.search(pat, str(formula)) is not None

def _detect_exposure_terms(formula: str) -> dict:
    """Map each exposure role to the first candidate column found in ``formula``.

    Returns a dict with keys ``"shock"``, ``"inter"``, ``"vol"`` and ``"cum"``.
    Each value is the first listed candidate name that appears in the formula
    as a whole identifier, or ``None`` when no candidate for that role is
    present.
    """
    # Candidates are listed in priority order for every role.
    candidates_by_role = {
        "shock": ("shock_nonscore", "ST_Shock_NonScore_w"),
        "inter": ("shock_x_blowout",),
        "vol": ("vol_nonscore_roll4_prior", "vol_nonscore_s2d_prior", "ST_Vol_NonScore_w"),
        "cum": ("cum_shocks_nonscore_prior", "Cum_Shocks_NonScore_w"),
    }

    detected = {}
    for role, names in candidates_by_role.items():
        detected[role] = None
        for name in names:
            if _infer_term_present(formula, name):
                detected[role] = name
                break
    return detected

# Cast the team identifier to str before it is used as the cluster grouping.
df[TEAM_COL] = df[TEAM_COL].astype(str)

# Pull the preferred formula and family for each side out of the spec rows.
f_def, fam_def = str(pref_def["formula"]), str(pref_def["family"])
f_off, fam_off = str(pref_off["formula"]), str(pref_off["family"])

exp_def = _detect_exposure_terms(f_def)
exp_off = _detect_exposure_terms(f_off)

# Every exposure role must be resolvable in both formulas before fitting.
for side_label, terms in (("def", exp_def), ("off", exp_off)):
    if any(terms[role] is None for role in ("shock", "inter", "vol", "cum")):
        raise RuntimeError(f"Could not detect exposures for {side_label} formula, inspect step18_preferred_model_specs")

# Refit both preferred models on the full frame, clustering on the team column.
count_def = _fit_count(f_def, fam_def, df, TEAM_COL)
count_off = _fit_count(f_off, fam_off, df, TEAM_COL)

def _tidy_params(res, side: str) -> pd.DataFrame:
    out = pd.DataFrame({
        "side": side,
        "outcome": str(getattr(res.model, "endog_names", "")),
        "term": res.params.index.astype(str),
        "beta": res.params.values.astype(float),
        "se_cluster": res.bse.values.astype(float),
        "pvalue": res.pvalues.values.astype(float),
    })
    out["irr"] = np.exp(out["beta"].astype(float))
    return out

# Stack the defensive and offensive coefficient tables into one frame.
side_results = (("def", count_def), ("off", count_off))
count_params = pd.concat(
    [_tidy_params(fitted, label) for label, fitted in side_results],
    ignore_index=True,
)

# CSV exports go to an outputs folder one level above the working directory.
out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

# Expose the frame to DuckDB under a temporary name, materialise it as a
# table, then drop the temporary registration.
tmp_view = "step22_count_params_tmp"
con.register(tmp_view, count_params)
con.execute("CREATE OR REPLACE TABLE step22_count_model_parameters AS SELECT * FROM step22_count_params_tmp")
con.unregister(tmp_view)

csv_path = out_dir / "step22_count_model_parameters.csv"
count_params.to_csv(csv_path, index=False)

print("wrote duckdb table step22_count_model_parameters")
print("wrote csv outputs step22_count_model_parameters.csv")
count_params.head(20)

wrote duckdb table step22_count_model_parameters
wrote csv outputs step22_count_model_parameters.csv


Unnamed: 0,side,outcome,term,beta,se_cluster,pvalue,irr
0,def,Inj_Def_Next_w,Intercept,-0.050838,0.220906,0.8179884,0.950433
1,def,Inj_Def_Next_w,C(team)[T.ATL],0.069043,0.015141,5.117431e-06,1.071482
2,def,Inj_Def_Next_w,C(team)[T.BAL],-0.045573,0.012824,0.0003797802,0.95545
3,def,Inj_Def_Next_w,C(team)[T.BUF],0.228725,0.016741,1.69652e-42,1.256997
4,def,Inj_Def_Next_w,C(team)[T.CAR],0.326101,0.017479,1.104785e-77,1.385555
5,def,Inj_Def_Next_w,C(team)[T.CHI],0.199739,0.011166,1.4618740000000001e-71,1.221084
6,def,Inj_Def_Next_w,C(team)[T.CIN],0.399726,0.016813,6.04515e-125,1.491416
7,def,Inj_Def_Next_w,C(team)[T.CLE],0.076197,0.012041,2.477563e-10,1.079175
8,def,Inj_Def_Next_w,C(team)[T.DAL],0.223739,0.011322,6.34207e-87,1.250745
9,def,Inj_Def_Next_w,C(team)[T.DEN],0.26587,0.011833,8.501905e-112,1.304565
