We initialize the Python imports and open a DuckDB connection that every later cell reuses

In [2]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import re
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import PerfectSeparationError

# Suppress the FutureWarning whose message starts with the BIC/deviance notice.
warnings.filterwarnings(
    action="ignore",
    message="The bic value is computed using the deviance formula.*",
    category=FutureWarning,
)

CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

# Walk the working directory and its ancestors twice: first looking for
# <dir>/db/nflpa.duckdb, then for <dir>/nflpa.duckdb. The first hit wins and
# the directory it was found under becomes REPO_ROOT.
_search_dirs = [CWD, *CWD.parents]
for _relative in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    for _folder in _search_dirs:
        _candidate = _folder / _relative
        if _candidate.exists():
            REPO_ROOT, DB_FILE = _folder, _candidate
            break
    if DB_FILE is not None:
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

# Opened read-write because later cells create tables on this connection.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Verify the upstream tables exist before reading anything from them.
need_tables = ["step21_frame", "step18_preferred_model_specs"]
existing = set(con.execute("SHOW TABLES").df()["name"].astype(str).tolist())
missing = [t for t in need_tables if t not in existing]

print("connected db", str(DB_FILE))
print("missing step 22 inputs", missing)
if missing:
    raise RuntimeError("Missing step 22 inputs, rerun step 21 and step 18 first")

df = con.execute("SELECT * FROM step21_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

# The team identifier is either "team" or "team_key"; "team" takes precedence.
TEAM_COL = next((c for c in ("team", "team_key") if c in df.columns), None)
if TEAM_COL is None:
    raise RuntimeError("Missing team identifier column in step21_frame")

# Fail with a descriptive message when a side has no preferred spec, instead
# of letting .iloc[0] raise an opaque out-of-bounds IndexError.
pref_side = pref["side"].astype(str)
missing_sides = [s for s in ("def", "off") if not (pref_side == s).any()]
if missing_sides:
    raise RuntimeError(
        f"step18_preferred_model_specs has no row for side(s) {missing_sides}, rerun step 18 first"
    )

# First matching row per side is used as the preferred specification.
pref_def = pref[pref_side == "def"].iloc[0]
pref_off = pref[pref_side == "off"].iloc[0]

print("preferred def family", pref_def["family"])
print("preferred off family", pref_off["family"])
print("preferred def formula", pref_def["formula"])
print("preferred off formula", pref_off["formula"])

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
missing step 22 inputs []
preferred def family poisson
preferred off family poisson
preferred def formula Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + vol_nonscore_roll4_prior + cum_shocks_nonscore_prior + short_week_flag_w + bye_last_week_flag_w + home_flag_w + blowout_flag_w + points_for + points_against + offensive_snaps_w + defensive_snaps_w + Inj_Off_Last_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + C(team) + C(season_week)
preferred off formula Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + vol_nonscore_roll4_prior + cum_shocks_nonscore_prior + short_week_flag_w + bye_last_week_flag_w + home_flag_w + blowout_flag_w + points_for + points_against + offensive_snaps_w + defensive_snaps_w + Inj_Off_Last_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + C(team) + C(season_week)


We refit the preferred count models on the full sample so we have model objects available for expected injury changes and league scaling

In [3]:
def _fit_count(formula: str, family: str, data: pd.DataFrame, cluster_col: str):
    """Fit a count model from a formula with cluster-robust standard errors.

    Args:
        formula: Patsy formula passed to the statsmodels formula API.
        family: Model family, case-insensitive. ``"poisson"`` fits a Poisson
            GLM; ``"negative_binomial"``, ``"negativebinomial"`` or ``"nb"``
            fits a negative binomial model.
        data: Frame the formula is evaluated against.
        cluster_col: Column of ``data`` whose values define the clusters.

    Returns:
        The fitted statsmodels results object with ``cov_type="cluster"``.

    Raises:
        RuntimeError: If ``family`` is not one of the recognised names.
    """
    fam = str(family).lower()
    if fam == "poisson":
        model = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
        return model.fit(cov_type="cluster", cov_kwds={"groups": data[cluster_col]})
    if fam in ("negative_binomial", "negativebinomial", "nb"):
        model = smf.negativebinomial(formula=formula, data=data)
        # Request the clustered covariance directly in fit(), as the logit
        # code in this notebook does. Discrete-model results do not expose the
        # public get_robustcov_results() that linear-model results have.
        return model.fit(
            disp=False,
            maxiter=200,
            cov_type="cluster",
            cov_kwds={"groups": data[cluster_col]},
        )
    raise RuntimeError(f"Unknown family {family}")

def _infer_term_present(formula: str, term: str) -> bool:
    """Return True when ``term`` occurs in ``formula`` as a whole identifier.

    A match must not be directly preceded or followed by a letter, digit or
    underscore, so ``shock_nonscore`` does not match ``shock_nonscore_w``.
    """
    matcher = re.compile(rf"(?<![A-Za-z0-9_]){re.escape(term)}(?![A-Za-z0-9_])")
    hit = matcher.search(str(formula))
    return hit is not None

def _detect_exposure_terms(formula: str) -> dict:
    """Find which exposure variable names a formula uses.

    For each role (``shock``, ``inter``, ``vol``, ``cum``) the candidate names
    are checked in order and the first one present in ``formula`` is kept.

    Returns:
        Dict with keys ``shock``, ``inter``, ``vol``, ``cum``; a value is
        ``None`` when no candidate for that role appears in the formula.
    """
    candidates_by_role = {
        "shock": ["shock_nonscore", "ST_Shock_NonScore_w"],
        "inter": ["shock_x_blowout"],
        "vol": ["vol_nonscore_roll4_prior", "vol_nonscore_s2d_prior", "ST_Vol_NonScore_w"],
        "cum": ["cum_shocks_nonscore_prior", "Cum_Shocks_NonScore_w"],
    }
    detected = {}
    for role, names in candidates_by_role.items():
        detected[role] = None
        for name in names:
            if _infer_term_present(formula, name):
                detected[role] = name
                break
    return detected

# Cluster labels are compared as strings.
df[TEAM_COL] = df[TEAM_COL].astype(str)

# Pull formula and family out of the preferred spec rows as plain strings.
f_def, fam_def = str(pref_def["formula"]), str(pref_def["family"])
f_off, fam_off = str(pref_off["formula"]), str(pref_off["family"])

exp_def = _detect_exposure_terms(f_def)
exp_off = _detect_exposure_terms(f_off)

# Every exposure role must be resolvable for both sides before fitting.
for _side_name, _roles in (("def", exp_def), ("off", exp_off)):
    if any(_roles[_key] is None for _key in ("shock", "inter", "vol", "cum")):
        raise RuntimeError(f"Could not detect exposures for {_side_name} formula, inspect step18_preferred_model_specs")

# Refit both preferred count models on the full frame, clustered by team.
count_def = _fit_count(f_def, fam_def, df, TEAM_COL)
count_off = _fit_count(f_off, fam_off, df, TEAM_COL)

def _tidy_params(res, side: str) -> pd.DataFrame:
    """Flatten a fitted count model into one row per coefficient.

    Args:
        res: Fitted results exposing ``params``, ``bse`` and ``pvalues`` as
            pandas Series, and ``model.endog_names``.
        side: Label stored in the ``side`` column (for example ``"def"``).

    Returns:
        Frame with columns ``side``, ``outcome``, ``term``, ``beta``,
        ``se_cluster``, ``pvalue`` and ``irr`` (``exp(beta)``).
    """
    coefs = res.params
    table = pd.DataFrame(
        {
            "side": side,
            "outcome": str(getattr(res.model, "endog_names", "")),
            "term": coefs.index.astype(str),
            "beta": coefs.values.astype(float),
            "se_cluster": res.bse.values.astype(float),
            "pvalue": res.pvalues.values.astype(float),
        }
    )
    # Incidence-rate ratio is the exponentiated coefficient.
    return table.assign(irr=np.exp(table["beta"].astype(float)))

# Stack the tidy coefficient tables for both sides, defense first.
count_params = pd.concat(
    [_tidy_params(_fit, _label) for _label, _fit in (("def", count_def), ("off", count_off))],
    ignore_index=True,
)

# CSV outputs go to ../outputs, resolved relative to the working directory.
out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

# Expose the frame to DuckDB under a temporary name, materialise it, then drop the view.
con.register("step22_count_params_tmp", count_params)
con.execute("CREATE OR REPLACE TABLE step22_count_model_parameters AS SELECT * FROM step22_count_params_tmp")
con.unregister("step22_count_params_tmp")

count_params.to_csv(out_dir / "step22_count_model_parameters.csv", index=False)

print("wrote duckdb table step22_count_model_parameters")
print("wrote csv outputs step22_count_model_parameters.csv")
count_params.head(20)

wrote duckdb table step22_count_model_parameters
wrote csv outputs step22_count_model_parameters.csv


Unnamed: 0,side,outcome,term,beta,se_cluster,pvalue,irr
0,def,Inj_Def_Next_w,Intercept,-0.050838,0.220906,0.8179884,0.950433
1,def,Inj_Def_Next_w,C(team)[T.ATL],0.069043,0.015141,5.117431e-06,1.071482
2,def,Inj_Def_Next_w,C(team)[T.BAL],-0.045573,0.012824,0.0003797802,0.95545
3,def,Inj_Def_Next_w,C(team)[T.BUF],0.228725,0.016741,1.69652e-42,1.256997
4,def,Inj_Def_Next_w,C(team)[T.CAR],0.326101,0.017479,1.104785e-77,1.385555
5,def,Inj_Def_Next_w,C(team)[T.CHI],0.199739,0.011166,1.4618740000000001e-71,1.221084
6,def,Inj_Def_Next_w,C(team)[T.CIN],0.399726,0.016813,6.04515e-125,1.491416
7,def,Inj_Def_Next_w,C(team)[T.CLE],0.076197,0.012041,2.477563e-10,1.079175
8,def,Inj_Def_Next_w,C(team)[T.DAL],0.223739,0.011322,6.34207e-87,1.250745
9,def,Inj_Def_Next_w,C(team)[T.DEN],0.26587,0.011833,8.501905e-112,1.304565


We translate coefficients into expected injury changes using average marginal effects under shock switches, volatility percentile moves, and cumulative shock increments

In [4]:
def _ame_shock_count(res, data: pd.DataFrame, shock_col: str, inter_col: str, blowout_col: str) -> dict:
    """Average effect on the predicted count of switching the shock from 0 to 1.

    Two copies of ``data`` are scored with ``res.predict``: one with the shock
    and its interaction set to 0, and one with the shock set to 1 and the
    interaction set to the blowout flag (missing flags count as 0).

    Returns:
        Dict with ``mu0_mean``, ``mu1_mean``, ``ame`` (mean of the row-wise
        differences) and ``multiplier`` (mean of the row-wise ratios). NaN
        predictions are ignored in every mean.
    """
    untreated = data.copy()
    treated = data.copy()

    untreated[shock_col] = 0
    untreated[inter_col] = 0

    treated[shock_col] = 1
    treated[inter_col] = treated[blowout_col].fillna(0).astype(int)

    baseline, shocked = (
        np.asarray(res.predict(frame), dtype=float) for frame in (untreated, treated)
    )
    # Floor the denominator so a zero prediction cannot divide by zero.
    ratio = shocked / np.maximum(baseline, 1e-12)

    summary = {}
    summary["mu0_mean"] = float(np.nanmean(baseline))
    summary["mu1_mean"] = float(np.nanmean(shocked))
    summary["ame"] = float(np.nanmean(shocked - baseline))
    summary["multiplier"] = float(np.nanmean(ratio))
    return summary

def _ame_percentile_count(res, data: pd.DataFrame, var_col: str, p_lo: float = 0.25, p_hi: float = 0.75) -> dict:
    """Average effect on the predicted count of moving ``var_col`` between two quantiles.

    Every row's ``var_col`` is set to the ``p_lo`` quantile in one copy of
    ``data`` and to the ``p_hi`` quantile in another; both quantiles are taken
    from ``data[var_col]`` with NaN ignored.

    Returns:
        Dict with ``p_lo``, ``p_hi`` (the quantile values), ``mu_lo_mean``,
        ``mu_hi_mean``, ``ame`` and ``multiplier``.
    """
    observed = data[var_col].astype(float)
    low_value, high_value = (float(np.nanquantile(observed, q)) for q in (p_lo, p_hi))

    at_low = data.copy()
    at_high = data.copy()
    at_low[var_col] = low_value
    at_high[var_col] = high_value

    pred_low = np.asarray(res.predict(at_low), dtype=float)
    pred_high = np.asarray(res.predict(at_high), dtype=float)
    # Floor the denominator so a zero prediction cannot divide by zero.
    ratio = pred_high / np.maximum(pred_low, 1e-12)

    return dict(
        p_lo=low_value,
        p_hi=high_value,
        mu_lo_mean=float(np.nanmean(pred_low)),
        mu_hi_mean=float(np.nanmean(pred_high)),
        ame=float(np.nanmean(pred_high - pred_low)),
        multiplier=float(np.nanmean(ratio)),
    )

def _ame_plus_one_count(res, data: pd.DataFrame, var_col: str) -> dict:
    """Average effect on the predicted count of adding 1 to ``var_col``.

    The baseline is scored on an unchanged copy of ``data``. The alternative
    copy has ``var_col`` replaced by its float value plus one, with missing
    values treated as 0 before the increment.

    Returns:
        Dict with ``mu0_mean``, ``mu1_mean``, ``ame`` and ``multiplier``; NaN
        predictions are ignored in every mean.
    """
    as_is = data.copy()
    bumped = data.copy()
    # NOTE(review): only the bumped frame has NaN filled; the baseline keeps
    # them, so the two means may cover different rows when NaN is present.
    bumped[var_col] = as_is[var_col].astype(float).fillna(0.0) + 1.0

    before = np.asarray(res.predict(as_is), dtype=float)
    after = np.asarray(res.predict(bumped), dtype=float)
    # Floor the denominator so a zero prediction cannot divide by zero.
    ratio = after / np.maximum(before, 1e-12)

    return dict(
        mu0_mean=float(np.nanmean(before)),
        mu1_mean=float(np.nanmean(after)),
        ame=float(np.nanmean(after - before)),
        multiplier=float(np.nanmean(ratio)),
    )

# Column holding the blowout indicator; _add_side passes it to the shock AME
# helper, which copies it into the interaction term when the shock is on.
BLOWOUT_COL = "blowout_flag_w"
if BLOWOUT_COL not in df.columns:
    raise RuntimeError("Missing blowout_flag_w in step21_frame")

# Module-level accumulator that _add_side appends its effect rows to.
rows = []

def _add_side(side: str, res, exp: dict):
    """Append the three count-effect rows for one side to the global ``rows``.

    Args:
        side: Label written to each row's ``side`` field.
        res: Fitted count model handed to the AME helpers.
        exp: Mapping with keys ``shock``, ``inter``, ``vol`` and ``cum``
            naming the exposure columns.

    All effects are evaluated on the module-level ``df``; nothing is returned.
    """
    shock_col, inter_col, vol_col, cum_col = (exp[k] for k in ("shock", "inter", "vol", "cum"))

    shock_fx = _ame_shock_count(res, df, shock_col, inter_col, BLOWOUT_COL)
    vol_fx = _ame_percentile_count(res, df, vol_col, 0.25, 0.75)
    cum_fx = _ame_plus_one_count(res, df, cum_col)

    # Key order within each dict determines the column order of the final table.
    rows.extend(
        [
            {
                "side": side,
                "effect": "shock_0_to_1",
                "var": shock_col,
                "ame": shock_fx["ame"],
                "multiplier": shock_fx["multiplier"],
                "baseline_mean": shock_fx["mu0_mean"],
                "alt_mean": shock_fx["mu1_mean"],
            },
            {
                "side": side,
                "effect": "vol_25_to_75",
                "var": vol_col,
                "p25": vol_fx["p_lo"],
                "p75": vol_fx["p_hi"],
                "ame": vol_fx["ame"],
                "multiplier": vol_fx["multiplier"],
                "baseline_mean": vol_fx["mu_lo_mean"],
                "alt_mean": vol_fx["mu_hi_mean"],
            },
            {
                "side": side,
                "effect": "cum_plus_1",
                "var": cum_col,
                "ame": cum_fx["ame"],
                "multiplier": cum_fx["multiplier"],
                "baseline_mean": cum_fx["mu0_mean"],
                "alt_mean": cum_fx["mu1_mean"],
            },
        ]
    )

# Collect the effect rows for defense first, then offense.
for _label, _fit, _terms in (("def", count_def, exp_def), ("off", count_off, exp_off)):
    _add_side(_label, _fit, _terms)

count_effects = pd.DataFrame(rows)

# Expose the frame to DuckDB under a temporary name, materialise it, then drop the view.
con.register("step22_count_effects_tmp", count_effects)
con.execute("CREATE OR REPLACE TABLE step22_count_effect_translations AS SELECT * FROM step22_count_effects_tmp")
con.unregister("step22_count_effects_tmp")

count_effects.to_csv(out_dir / "step22_count_effect_translations.csv", index=False)

print("wrote duckdb table step22_count_effect_translations")
print("wrote csv outputs step22_count_effect_translations.csv")
count_effects

wrote duckdb table step22_count_effect_translations
wrote csv outputs step22_count_effect_translations.csv


Unnamed: 0,side,effect,var,ame,multiplier,baseline_mean,alt_mean,p25,p75
0,def,shock_0_to_1,shock_nonscore,0.073524,1.036019,2.071458,2.144982,,
1,def,vol_25_to_75,vol_nonscore_roll4_prior,-0.045539,0.97841,2.10921,2.063671,1.0,3.316625
2,def,cum_plus_1,cum_shocks_nonscore_prior,0.012447,1.005973,2.083866,2.096312,,
3,off,shock_0_to_1,shock_nonscore,0.113743,1.060039,1.901461,2.015204,,
4,off,vol_25_to_75,vol_nonscore_roll4_prior,0.103057,1.055311,1.863239,1.966296,1.0,3.316625
5,off,cum_plus_1,cum_shocks_nonscore_prior,-0.019694,0.989743,1.920168,1.900474,,


We fit logistic models using the same preferred right hand sides so 22.2 can report probability changes under the same exposure definitions

In [9]:
import warnings
from statsmodels.tools.sm_exceptions import PerfectSeparationError, ConvergenceWarning

# Ignore statsmodels ConvergenceWarning from this point on.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Use df21 if that name already exists in the session, otherwise fall back to df.
DATA = df21 if "df21" in globals() else df

# Same idea for the formulas: prefer base_formula_* when defined, else the
# preferred count-model formulas f_def / f_off.
BASE_DEF = base_formula_def if "base_formula_def" in globals() else f_def
BASE_OFF = base_formula_off if "base_formula_off" in globals() else f_off

def _safe_exp(x):
    """Exponentiate ``x`` after clipping it to [-50, 50] so the result stays finite."""
    bounded = np.clip(np.asarray(x, dtype=float), -50.0, 50.0)
    return np.exp(bounded)

def _swap_outcome(formula: str, new_outcome: str) -> str:
    """Return ``formula`` with its left-hand side replaced by ``new_outcome``.

    Everything after the first ``~`` is kept (stripped of surrounding
    whitespace). Raises IndexError when the formula contains no ``~``.
    """
    pieces = str(formula).split("~", 1)
    right_side = pieces[1].strip()
    return "{} ~ {}".format(new_outcome, right_side)

def _rhs_terms_simple(formula: str) -> list[str]:
    """Split the right-hand side of a formula into its ``+``-separated terms.

    Terms are stripped and empty ones dropped. The split is purely textual, so
    a ``+`` inside parentheses is also treated as a separator.
    """
    right_side = str(formula).split("~", 1)[1]
    terms = []
    for chunk in right_side.split("+"):
        cleaned = chunk.strip()
        if cleaned:
            terms.append(cleaned)
    return terms

def _fix_time_fe_for_logit(formula: str, data: pd.DataFrame, ycol: str) -> tuple[str, dict]:
    """Swap ``C(season_week)`` for ``C(season) + C(week)`` when it cannot be estimated.

    When the formula contains ``C(season_week)``, the number of ``season_week``
    groups in which ``ycol`` takes a single distinct value is counted. If any
    exist, the term is removed and ``C(season)`` / ``C(week)`` are appended
    unless already present.

    Returns:
        ``(formula, info)``. The formula is rebuilt with ``ycol`` as outcome.
        ``info`` has ``patched`` (0/1), ``no_var_groups`` (count, or NaN when
        the term is absent) and ``time_term`` (the term found, or ``""``).
    """
    time_fe = "C(season_week)"
    terms = _rhs_terms_simple(formula)
    info = {"patched": 0, "no_var_groups": np.nan, "time_term": ""}

    if time_fe in terms:
        info["time_term"] = time_fe
        distinct_per_group = data.groupby("season_week")[ycol].nunique()
        constant_groups = int((distinct_per_group == 1).sum())
        info["no_var_groups"] = constant_groups

        if constant_groups > 0:
            terms = [term for term in terms if term != time_fe]
            for replacement in ("C(season)", "C(week)"):
                if replacement not in terms:
                    terms.append(replacement)
            info["patched"] = 1

    return f"{ycol} ~ {' + '.join(terms)}", info

def _fit_logit_cluster(formula: str, data: pd.DataFrame, cluster_col: str):
    """Fit a logit model with cluster-robust standard errors.

    Args:
        formula: Patsy formula for ``smf.logit``.
        data: Frame the formula is evaluated against.
        cluster_col: Column of ``data`` whose values define the clusters.

    Returns:
        ``(results, "logit")``.

    Raises:
        AttributeError: If the fit raises ``TypeError``. This is reported as
            an unsupported clustered covariance so that ``_fit_binary_stable``
            can fall back to another estimator.
    """
    model = smf.logit(formula=formula, data=data)
    try:
        fitted = model.fit(
            disp=False,
            maxiter=200,
            cov_type="cluster",
            cov_kwds={"groups": data[cluster_col]},
        )
    except TypeError as err:
        # No second, non-robust fit here: its result was never used, and the
        # caller refits with a different estimator anyway. The original
        # TypeError is chained for debugging.
        raise AttributeError("logit cluster cov not supported in this statsmodels build") from err
    return fitted, "logit"

def _fit_glm_binom_cluster(formula: str, data: pd.DataFrame, cluster_col: str):
    """Fit a binomial GLM with cluster-robust errors; returns ``(results, "glm_binomial")``."""
    binomial_glm = smf.glm(formula=formula, data=data, family=sm.families.Binomial())
    clustered = binomial_glm.fit(cov_type="cluster", cov_kwds=dict(groups=data[cluster_col]))
    return clustered, "glm_binomial"

def _fit_glm_binom_ridge(formula: str, data: pd.DataFrame):
    """Fit a regularized binomial GLM; returns ``(results, "glm_binomial_ridge")``.

    Uses ``alpha=1.0`` with ``L1_wt=0.0`` (no L1 share) and at most 200
    iterations.
    """
    penalty = {"alpha": 1.0, "L1_wt": 0.0, "maxiter": 200}
    binomial_glm = smf.glm(formula=formula, data=data, family=sm.families.Binomial())
    return binomial_glm.fit_regularized(**penalty), "glm_binomial_ridge"

def _fit_binary_stable(formula: str, data: pd.DataFrame, cluster_col: str):
    """Fit a binary-outcome model, falling back to more forgiving estimators.

    Order tried: clustered logit, then clustered binomial GLM, then the
    regularized binomial GLM. Only ``LinAlgError``, ``PerfectSeparationError``
    and ``AttributeError`` from the logit trigger the first fallback; any
    exception from the clustered GLM triggers the second.

    Returns:
        ``(results, tag)`` where ``tag`` names the estimator that succeeded.
    """
    logit_failures = (np.linalg.LinAlgError, PerfectSeparationError, AttributeError)
    try:
        outcome = _fit_logit_cluster(formula, data, cluster_col)
    except logit_failures:
        try:
            outcome = _fit_glm_binom_cluster(formula, data, cluster_col)
        except Exception:
            # Last resort; the returned tag records that the ridge fit was used.
            fitted, tag = _fit_glm_binom_ridge(formula, data)
            outcome = (fitted, tag)
    return outcome

def _params_series(res) -> pd.Series:
    """Return the coefficients of ``res`` as a float Series with term labels.

    A missing ``params`` gives an empty Series and a Series is returned as
    float. Array-like params are labelled with ``res.model.exog_names`` when
    that list exists and matches in length, otherwise with ``param_<i>``.
    """
    raw = getattr(res, "params", None)
    if raw is None:
        return pd.Series(dtype=float)
    if isinstance(raw, pd.Series):
        return raw.astype(float)

    try:
        labels = list(getattr(getattr(res, "model", None), "exog_names", []))
    except Exception:
        labels = None

    size = len(np.asarray(raw))
    values = np.asarray(raw, dtype=float)
    if labels and len(labels) == size:
        index = [str(label) for label in labels]
    else:
        index = [f"param_{i}" for i in range(size)]
    return pd.Series(values, index=index)

def _safe_attr_vec(res, attr: str, n: int):
    """Read ``res.<attr>`` as a float array of length ``n``, or NaNs if unavailable.

    Args:
        res: Object to read from (typically a fitted results object).
        attr: Attribute name, for example ``"bse"`` or ``"pvalues"``.
        n: Required size of the first dimension.

    Returns:
        The attribute as a float ndarray when it exists, converts to float and
        its first dimension equals ``n``; otherwise a length-``n`` array of
        NaN. This function never raises for a failing attribute.
    """
    fallback = np.full(n, np.nan, dtype=float)
    try:
        # The lookup sits inside the guard: a lazily computed attribute can
        # raise errors other than AttributeError, which getattr's default
        # does not absorb.
        raw = getattr(res, attr, None)
        if raw is None:
            return fallback
        vec = np.asarray(raw, dtype=float)
        if vec.ndim == 0 or vec.shape[0] != n:
            return fallback
    except Exception:
        # Deliberate best effort: any failure yields the NaN vector.
        return fallback
    return vec

def _tidy_logit_like(res, side: str, fit_kind: str) -> pd.DataFrame:
    """Flatten a fitted binary-outcome model into one row per coefficient.

    Args:
        res: Fitted results; standard errors and p-values are optional and
            become NaN when they cannot be read.
        side: Label for the ``side`` column.
        fit_kind: Estimator tag for the ``fit_kind`` column.

    Returns:
        Frame with ``side``, ``fit_kind``, ``outcome``, ``term``, ``beta``,
        ``se_cluster``, ``pvalue``, ``odds_ratio`` and the bounds
        ``odds_ratio_ci_lo`` / ``odds_ratio_ci_hi`` from ``beta`` ± 1.96·SE.
    """
    coefs = _params_series(res)
    n_terms = len(coefs)
    std_err = _safe_attr_vec(res, "bse", n_terms)
    p_values = _safe_attr_vec(res, "pvalues", n_terms)

    table = pd.DataFrame(
        {
            "side": str(side),
            "fit_kind": str(fit_kind),
            "outcome": str(getattr(getattr(res, "model", None), "endog_names", "")),
            "term": coefs.index.astype(str),
            "beta": coefs.values.astype(float),
            "se_cluster": std_err,
            "pvalue": p_values,
        }
    )

    log_odds = table["beta"].astype(float)
    half_width = 1.96 * table["se_cluster"].astype(float)
    table["odds_ratio"] = _safe_exp(log_odds)
    table["odds_ratio_ci_lo"] = _safe_exp(log_odds - half_width)
    table["odds_ratio_ci_hi"] = _safe_exp(log_odds + half_width)
    return table

# Derive the binary "any injury next week" outcomes from the counts when absent.
# NOTE(review): a missing count compares False against 0 and is coded 0 here — confirm intended.
for _flag_col, _count_col in (
    ("Any_Def_Injury_Next_w", "Inj_Def_Next_w"),
    ("Any_Off_Injury_Next_w", "Inj_Off_Next_w"),
):
    if _flag_col not in DATA.columns:
        DATA[_flag_col] = (DATA[_count_col].astype(float) > 0).astype(int)

# Reuse the count-model right-hand sides with the binary outcomes on the left.
logit_formula_def_raw = _swap_outcome(BASE_DEF, "Any_Def_Injury_Next_w")
logit_formula_off_raw = _swap_outcome(BASE_OFF, "Any_Off_Injury_Next_w")

# Replace the season-week fixed effect where some cells have a constant outcome.
logit_formula_def, info_def = _fix_time_fe_for_logit(logit_formula_def_raw, DATA, "Any_Def_Injury_Next_w")
logit_formula_off, info_off = _fix_time_fe_for_logit(logit_formula_off_raw, DATA, "Any_Off_Injury_Next_w")

print("logit time fe patch def", info_def)
print("logit time fe patch off", info_off)

logit_def, kind_def = _fit_binary_stable(logit_formula_def, DATA, TEAM_COL)
logit_off, kind_off = _fit_binary_stable(logit_formula_off, DATA, TEAM_COL)

logit_params = pd.concat(
    [
        _tidy_logit_like(_fit, _label, _kind)
        for _label, _fit, _kind in (("def", logit_def, kind_def), ("off", logit_off, kind_off))
    ],
    ignore_index=True,
)

# Expose the frame to DuckDB under a temporary name, materialise it, then drop the view.
con.register("step22_logit_params_tmp", logit_params)
con.execute("CREATE OR REPLACE TABLE step22_logit_model_parameters AS SELECT * FROM step22_logit_params_tmp")
con.unregister("step22_logit_params_tmp")

logit_params.to_csv(out_dir / "step22_logit_model_parameters.csv", index=False)

print("wrote duckdb table step22_logit_model_parameters")
print("wrote csv outputs step22_logit_model_parameters.csv")
print("fit kinds", logit_params.groupby(["side", "fit_kind"]).size().reset_index(name="n"))
logit_params.head(20)

logit time fe patch def {'patched': 1, 'no_var_groups': 8, 'time_term': 'C(season_week)'}
logit time fe patch off {'patched': 1, 'no_var_groups': 4, 'time_term': 'C(season_week)'}
wrote duckdb table step22_logit_model_parameters
wrote csv outputs step22_logit_model_parameters.csv
fit kinds   side fit_kind   n
0  def    logit  78
1  off    logit  78


Unnamed: 0,side,fit_kind,outcome,term,beta,se_cluster,pvalue,odds_ratio,odds_ratio_ci_lo,odds_ratio_ci_hi
0,def,logit,Any_Def_Injury_Next_w,Intercept,-0.409729,0.74278,0.581212,0.66383,0.154807,2.846584
1,def,logit,Any_Def_Injury_Next_w,C(team)[T.ATL],0.790488,0.063283,8.322325999999999e-36,2.204472,1.947319,2.495583
2,def,logit,Any_Def_Injury_Next_w,C(team)[T.BAL],0.008252,0.045102,0.8548346,1.008286,0.922979,1.101477
3,def,logit,Any_Def_Injury_Next_w,C(team)[T.BUF],1.385532,0.077672,3.571354e-71,3.996952,3.432519,4.654198
4,def,logit,Any_Def_Injury_Next_w,C(team)[T.CAR],1.556035,0.069939,1.171315e-109,4.739988,4.132792,5.436393
5,def,logit,Any_Def_Injury_Next_w,C(team)[T.CHI],1.080452,0.052855,7.113605e-93,2.946011,2.656092,3.267575
6,def,logit,Any_Def_Injury_Next_w,C(team)[T.CIN],2.785314,0.079405,1.502248e-269,16.204903,13.869324,18.933792
7,def,logit,Any_Def_Injury_Next_w,C(team)[T.CLE],0.001963,0.060547,0.9741304,1.001965,0.889844,1.128214
8,def,logit,Any_Def_Injury_Next_w,C(team)[T.DAL],1.139545,0.050738,1.0366589999999999e-111,3.125345,2.829494,3.452129
9,def,logit,Any_Def_Injury_Next_w,C(team)[T.DEN],1.266218,0.045002,3.458576e-174,3.547412,3.247919,3.874521


We translate logit coefficients into probability changes using average marginal effects for shock switches, volatility percentile moves, and cumulative shock increments

In [10]:
def _ame_shock_logit(res, data: pd.DataFrame, shock_col: str, inter_col: str, blowout_col: str) -> dict:
    """Average change in predicted probability when the shock goes from 0 to 1.

    The baseline copy has the shock and its interaction set to 0. The
    alternative has the shock set to 1 and the interaction set to the blowout
    flag (missing flags count as 0).

    Returns:
        Dict with ``p0_mean``, ``p1_mean`` and ``ame``; NaN predictions are
        ignored in every mean.
    """
    untreated = data.copy()
    treated = data.copy()

    untreated[shock_col] = 0
    untreated[inter_col] = 0

    treated[shock_col] = 1
    treated[inter_col] = treated[blowout_col].fillna(0).astype(int)

    prob_off, prob_on = (
        np.asarray(res.predict(frame), dtype=float) for frame in (untreated, treated)
    )

    return dict(
        p0_mean=float(np.nanmean(prob_off)),
        p1_mean=float(np.nanmean(prob_on)),
        ame=float(np.nanmean(prob_on - prob_off)),
    )

def _ame_percentile_logit(res, data: pd.DataFrame, var_col: str, p_lo: float = 0.25, p_hi: float = 0.75) -> dict:
    """Average change in predicted probability when ``var_col`` moves between two quantiles.

    Both quantiles are taken from ``data[var_col]`` with NaN ignored, and each
    is assigned to every row of its own copy of ``data``.

    Returns:
        Dict with ``p_lo``, ``p_hi`` (the quantile values), ``p0_mean``,
        ``p1_mean`` and ``ame``.
    """
    observed = data[var_col].astype(float)
    low_value, high_value = (float(np.nanquantile(observed, q)) for q in (p_lo, p_hi))

    at_low = data.copy()
    at_high = data.copy()
    at_low[var_col] = low_value
    at_high[var_col] = high_value

    prob_low = np.asarray(res.predict(at_low), dtype=float)
    prob_high = np.asarray(res.predict(at_high), dtype=float)

    return dict(
        p_lo=low_value,
        p_hi=high_value,
        p0_mean=float(np.nanmean(prob_low)),
        p1_mean=float(np.nanmean(prob_high)),
        ame=float(np.nanmean(prob_high - prob_low)),
    )

def _ame_plus_one_logit(res, data: pd.DataFrame, var_col: str) -> dict:
    """Average change in predicted probability from adding 1 to ``var_col``.

    The baseline is an unchanged copy of ``data``. The alternative copy has
    ``var_col`` replaced by its float value plus one, with missing values
    treated as 0 before the increment.

    Returns:
        Dict with ``p0_mean``, ``p1_mean`` and ``ame``; NaN predictions are
        ignored in every mean.
    """
    as_is = data.copy()
    bumped = data.copy()
    # NOTE(review): only the bumped frame has NaN filled; the baseline keeps
    # them, so the two means may cover different rows when NaN is present.
    bumped[var_col] = as_is[var_col].astype(float).fillna(0.0) + 1.0

    prob_before = np.asarray(res.predict(as_is), dtype=float)
    prob_after = np.asarray(res.predict(bumped), dtype=float)

    return dict(
        p0_mean=float(np.nanmean(prob_before)),
        p1_mean=float(np.nanmean(prob_after)),
        ame=float(np.nanmean(prob_after - prob_before)),
    )

# Fresh accumulator for the logit effect rows (the name is shared with the count cell).
rows = []

def _add_logit(side: str, res, exp: dict):
    """Append the three probability-effect rows for one side to the global ``rows``.

    Args:
        side: Label written to each row's ``side`` field.
        res: Fitted binary-outcome model handed to the AME helpers.
        exp: Mapping with keys ``shock``, ``inter``, ``vol`` and ``cum``
            naming the exposure columns.

    Effects are averaged over ``DATA``, the frame the logit models were fitted
    on, so the sample stays consistent when ``DATA`` is not ``df``. Nothing is
    returned.
    """
    shock_col, inter_col, vol_col, cum_col = (exp[k] for k in ("shock", "inter", "vol", "cum"))

    r_shock = _ame_shock_logit(res, DATA, shock_col, inter_col, BLOWOUT_COL)
    r_vol = _ame_percentile_logit(res, DATA, vol_col, 0.25, 0.75)
    r_cum = _ame_plus_one_logit(res, DATA, cum_col)

    # Key order within each dict determines the column order of the final table.
    rows.append({
        "side": side,
        "effect": "shock_0_to_1",
        "var": shock_col,
        "ame_prob": r_shock["ame"],
        "p0_mean": r_shock["p0_mean"],
        "p1_mean": r_shock["p1_mean"],
    })
    rows.append({
        "side": side,
        "effect": "vol_25_to_75",
        "var": vol_col,
        "p25": r_vol["p_lo"],
        "p75": r_vol["p_hi"],
        "ame_prob": r_vol["ame"],
        "p0_mean": r_vol["p0_mean"],
        "p1_mean": r_vol["p1_mean"],
    })
    rows.append({
        "side": side,
        "effect": "cum_plus_1",
        "var": cum_col,
        "ame_prob": r_cum["ame"],
        "p0_mean": r_cum["p0_mean"],
        "p1_mean": r_cum["p1_mean"],
    })

# Collect the probability-effect rows for defense first, then offense.
for _label, _fit, _terms in (("def", logit_def, exp_def), ("off", logit_off, exp_off)):
    _add_logit(_label, _fit, _terms)

logit_effects = pd.DataFrame(rows)

# Expose the frame to DuckDB under a temporary name, materialise it, then drop the view.
con.register("step22_logit_effects_tmp", logit_effects)
con.execute("CREATE OR REPLACE TABLE step22_logit_effect_translations AS SELECT * FROM step22_logit_effects_tmp")
con.unregister("step22_logit_effects_tmp")

logit_effects.to_csv(out_dir / "step22_logit_effect_translations.csv", index=False)

print("wrote duckdb table step22_logit_effect_translations")
print("wrote csv outputs step22_logit_effect_translations.csv")
logit_effects

wrote duckdb table step22_logit_effect_translations
wrote csv outputs step22_logit_effect_translations.csv


Unnamed: 0,side,effect,var,ame_prob,p0_mean,p1_mean,p25,p75
0,def,shock_0_to_1,shock_nonscore,0.013877,0.842578,0.856455,,
1,def,vol_25_to_75,vol_nonscore_roll4_prior,-0.014182,0.852457,0.838275,1.0,3.316625
2,def,cum_plus_1,cum_shocks_nonscore_prior,0.00114,0.844706,0.845846,,
3,off,shock_0_to_1,shock_nonscore,0.025176,0.818592,0.843768,,
4,off,vol_25_to_75,vol_nonscore_roll4_prior,0.024628,0.808949,0.833577,1.0,3.316625
5,off,cum_plus_1,cum_shocks_nonscore_prior,-0.004526,0.822857,0.818331,,


We scale per team week effects to season totals using both observed shock frequency and full league team week counts

In [11]:
SEASON_COL = "season"

if SEASON_COL not in df.columns:
    raise RuntimeError("Missing season in step21_frame")

# Per-season row counts and per-season sums of each side's shock column,
# keeping seasons in order of first appearance.
_by_season = df.groupby(SEASON_COL, sort=False)
teamweeks_by_season = _by_season.size().reset_index(name="n_teamweeks")
shockweeks_by_season_def = _by_season[exp_def["shock"]].sum().reset_index(name="n_shock_teamweeks_def")
shockweeks_by_season_off = _by_season[exp_off["shock"]].sum().reset_index(name="n_shock_teamweeks_off")

n_teamweeks_typical = float(teamweeks_by_season["n_teamweeks"].median())
print("typical team weeks per season in sample", n_teamweeks_typical)

count_eff = con.execute("SELECT * FROM step22_count_effect_translations").df()


def _count_ame(side_label, effect_label):
    """Return the stored count-model AME for one (side, effect) pair."""
    match = count_eff[(count_eff["side"] == side_label) & (count_eff["effect"] == effect_label)]
    return float(match["ame"].iloc[0])


def_shock_ame = _count_ame("def", "shock_0_to_1")
off_shock_ame = _count_ame("off", "shock_0_to_1")

def_vol_ame = _count_ame("def", "vol_25_to_75")
off_vol_ame = _count_ame("off", "vol_25_to_75")

def_cum_ame = _count_ame("def", "cum_plus_1")
off_cum_ame = _count_ame("off", "cum_plus_1")

league_scale = teamweeks_by_season.merge(shockweeks_by_season_def, on=SEASON_COL, how="left")
league_scale = league_scale.merge(shockweeks_by_season_off, on=SEASON_COL, how="left")
for _shock_count_col in ("n_shock_teamweeks_def", "n_shock_teamweeks_off"):
    league_scale[_shock_count_col] = league_scale[_shock_count_col].fillna(0.0)

_n_weeks = league_scale["n_teamweeks"].astype(float)

# Shock AME multiplied by the observed number of shock team-weeks per season.
league_scale["extra_def_inj_from_shocks_observed"] = league_scale["n_shock_teamweeks_def"].astype(float) * def_shock_ame
league_scale["extra_off_inj_from_shocks_observed"] = league_scale["n_shock_teamweeks_off"].astype(float) * off_shock_ame

# Shock AME multiplied by every team-week in the season.
league_scale["extra_def_inj_if_all_teamweeks_shock"] = _n_weeks * def_shock_ame
league_scale["extra_off_inj_if_all_teamweeks_shock"] = _n_weeks * off_shock_ame

# Volatility p25 -> p75 AME, per team-week and multiplied by all team-weeks.
league_scale["extra_def_inj_vol_p25_to_p75_per_teamweek"] = def_vol_ame
league_scale["extra_off_inj_vol_p25_to_p75_per_teamweek"] = off_vol_ame
league_scale["extra_def_inj_vol_p25_to_p75_if_all_teamweeks"] = _n_weeks * def_vol_ame
league_scale["extra_off_inj_vol_p25_to_p75_if_all_teamweeks"] = _n_weeks * off_vol_ame

# Cumulative-shock +1 AME multiplied by all team-weeks.
league_scale["extra_def_inj_cum_plus_1_if_all_teamweeks"] = _n_weeks * def_cum_ame
league_scale["extra_off_inj_cum_plus_1_if_all_teamweeks"] = _n_weeks * off_cum_ame

# Expose the frame to DuckDB under a temporary name, materialise it, then drop the view.
con.register("step22_league_scale_tmp", league_scale)
con.execute("CREATE OR REPLACE TABLE step22_league_level_scaling AS SELECT * FROM step22_league_scale_tmp")
con.unregister("step22_league_scale_tmp")

league_scale.to_csv(out_dir / "step22_league_level_scaling.csv", index=False)

print("wrote duckdb table step22_league_level_scaling")
print("wrote csv outputs step22_league_level_scaling.csv")
league_scale

typical team weeks per season in sample 448.0
wrote duckdb table step22_league_level_scaling
wrote csv outputs step22_league_level_scaling.csv


Unnamed: 0,season,n_teamweeks,n_shock_teamweeks_def,n_shock_teamweeks_off,extra_def_inj_from_shocks_observed,extra_off_inj_from_shocks_observed,extra_def_inj_if_all_teamweeks_shock,extra_off_inj_if_all_teamweeks_shock,extra_def_inj_vol_p25_to_p75_per_teamweek,extra_off_inj_vol_p25_to_p75_per_teamweek,extra_def_inj_vol_p25_to_p75_if_all_teamweeks,extra_off_inj_vol_p25_to_p75_if_all_teamweeks,extra_def_inj_cum_plus_1_if_all_teamweeks,extra_off_inj_cum_plus_1_if_all_teamweeks
0,2012,448,75,75,5.514274,8.530713,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
1,2013,448,69,69,5.073132,7.848256,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
2,2014,448,77,77,5.661322,8.758199,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
3,2015,448,84,84,6.175987,9.554399,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
4,2016,448,76,76,5.587798,8.644456,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
5,2017,450,72,72,5.293703,8.189485,33.085646,51.184279,-0.045539,0.103057,-20.49246,46.375528,5.600952,-8.862421
6,2018,448,65,65,4.779038,7.393285,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
7,2019,448,70,70,5.146656,7.961999,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
8,2020,448,92,92,6.764177,10.464342,32.938599,50.956794,-0.045539,0.103057,-20.401383,46.169414,5.576059,-8.823033
9,2021,480,76,76,5.587798,8.644456,35.291356,54.596565,-0.045539,0.103057,-21.858624,49.467229,5.974349,-9.45325


We produce a single compact translation table that reports per team week effects and observed league season totals side by side for offense and defense

In [12]:
# Read back the three tables written by the earlier cells.
count_eff = con.execute("SELECT * FROM step22_count_effect_translations").df()
logit_eff = con.execute("SELECT * FROM step22_logit_effect_translations").df()
league_scale = con.execute("SELECT * FROM step22_league_level_scaling").df()

# Mean of every numeric league-scaling column, with the season column dropped first.
avg_league = league_scale.drop(columns=["season"]).mean(numeric_only=True).to_dict()

_OBSERVED_SHOCK_KEY = {
    "def": "extra_def_inj_from_shocks_observed",
    "off": "extra_off_inj_from_shocks_observed",
}


def _effect_row(table, side_label, effect_label):
    """Return the first row of ``table`` for a (side, effect) pair as a dict."""
    match = table[(table["side"] == side_label) & (table["effect"] == effect_label)]
    return match.iloc[0].to_dict()


def _summary_row(side_label):
    """Build the one-line summary for a side from the count, logit and league tables."""
    obs_key = _OBSERVED_SHOCK_KEY["def" if side_label == "def" else "off"]
    return {
        "side": side_label,
        "count_shock_ame_inj_per_teamweek_when_shock": float(_effect_row(count_eff, side_label, "shock_0_to_1")["ame"]),
        "count_vol_ame_inj_per_teamweek_p25_to_p75": float(_effect_row(count_eff, side_label, "vol_25_to_75")["ame"]),
        "count_cum_plus1_ame_inj_per_teamweek": float(_effect_row(count_eff, side_label, "cum_plus_1")["ame"]),
        "logit_shock_ame_prob_points": float(_effect_row(logit_eff, side_label, "shock_0_to_1")["ame_prob"]),
        "logit_vol_ame_prob_points": float(_effect_row(logit_eff, side_label, "vol_25_to_75")["ame_prob"]),
        "logit_cum_plus1_ame_prob_points": float(_effect_row(logit_eff, side_label, "cum_plus_1")["ame_prob"]),
        "league_extra_injuries_per_season_from_shocks_observed": float(avg_league.get(obs_key, np.nan)),
    }


rows = [_summary_row(side) for side in ["def", "off"]]

translation = pd.DataFrame(rows)

# Expose the frame to DuckDB under a temporary name, materialise it, then drop the view.
con.register("step22_translation_tmp", translation)
con.execute("CREATE OR REPLACE TABLE step22_translation_summary AS SELECT * FROM step22_translation_tmp")
con.unregister("step22_translation_tmp")

translation.to_csv(out_dir / "step22_translation_summary.csv", index=False)

print("wrote duckdb table step22_translation_summary")
print("wrote csv outputs step22_translation_summary.csv")
translation

wrote duckdb table step22_translation_summary
wrote csv outputs step22_translation_summary.csv


Unnamed: 0,side,count_shock_ame_inj_per_teamweek_when_shock,count_vol_ame_inj_per_teamweek_p25_to_p75,count_cum_plus1_ame_inj_per_teamweek,logit_shock_ame_prob_points,logit_vol_ame_prob_points,logit_cum_plus1_ame_prob_points,league_extra_injuries_per_season_from_shocks_observed
0,def,0.073524,-0.045539,0.012447,0.013877,-0.014182,0.00114,5.695256
1,off,0.113743,0.103057,-0.019694,0.025176,0.024628,-0.004526,8.810696
