We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [2]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Best-effort: flip statsmodels' GLM BIC switch when this build exposes it.
# Only "module missing" and "attribute missing" are tolerated; any other
# failure is unexpected and is allowed to surface instead of being hidden.
# NOTE(review): judging by the flag name and the warning text filtered below,
# this selects the log-likelihood based BIC -- confirm against statsmodels docs.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except (ImportError, AttributeError):
    pass

# Silence the FutureWarning about the deviance-based BIC formula so it does
# not clutter cell output when the switch above could not be applied.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

# Locate the DuckDB file by walking upward from the working directory.
# The nested layout (db/nflpa.duckdb) is tried against every ancestor first;
# only if no ancestor has it is a bare nflpa.duckdb looked for.
CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

_search_dirs = [CWD, *CWD.parents]
for _relative in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    # First directory (nearest to CWD) that contains this layout, if any.
    _hit = next((d for d in _search_dirs if (d / _relative).exists()), None)
    if _hit is not None:
        REPO_ROOT = _hit
        DB_FILE = _hit / _relative
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb or nflpa.duckdb near this notebook")

# Shared connection for the rest of the notebook; opened writable
# (read_only=False), exactly as before.
con = duckdb.connect(str(DB_FILE), read_only=False)

base_table = "step16_modeling_frame_nolookahead"

# Count matching tables/views in the main schema. The table name is bound as
# a prepared-statement parameter rather than spliced into the SQL text.
exists_df = con.execute(
    """
SELECT
  COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = ?
  AND table_type IN ('BASE TABLE', 'VIEW')
""",
    [base_table],
).df()

if int(exists_df["n"].iloc[0]) == 0:
    # Report the table that was actually looked up, not a stale hard-coded name.
    raise RuntimeError(f"Missing {base_table}, run notebook 16 first")

print("connected db", str(DB_FILE))
print("base table", base_table)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
base table step16_modeling_frame_nolookahead


A quick sanity check confirms that both binary outcomes have variation and that four core predictors (`ST_Vol_NonScore_w`, `Cum_Shocks_NonScore_w`, `points_for`, `points_against`) have no missing values in the modeling frame.

In [3]:
# Load the full modeling frame into pandas.
df = con.execute(f"SELECT * FROM {base_table}").df()

# Binary outcomes: 1 when the next-week injury value is at least 1, else 0.
# NOTE(review): a missing value compares False and is therefore coded 0 --
# confirm the source columns cannot be null.
def_next_w = df["Inj_Def_Next_w"].astype(float)
off_next_w = df["Inj_Off_Next_w"].astype(float)
df["Any_Def_Injury_Next_w"] = def_next_w.ge(1.0).astype(int)
df["Any_Off_Injury_Next_w"] = off_next_w.ge(1.0).astype(int)

# One-row overview: sample size, event rates and event counts per side.
any_def = df["Any_Def_Injury_Next_w"]
any_off = df["Any_Off_Injury_Next_w"]
summary = pd.DataFrame(
    [
        {
            "n_rows": len(df),
            "def_rate": float(any_def.mean()),
            "off_rate": float(any_off.mean()),
            "def_any_ones": int(any_def.sum()),
            "off_any_ones": int(any_off.sum()),
        }
    ]
)

# One-row table of null counts for the four core predictors.
core_predictors = (
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "points_for",
    "points_against",
)
missing_core = pd.DataFrame(
    {f"missing_{col}": [int(df[col].isna().sum())] for col in core_predictors}
)

# Trailing expression so the notebook renders both tables.
summary, missing_core

(   n_rows  def_rate  off_rate  def_any_ones  off_any_ones
 0    5950  0.844706  0.822857          5026          4896,
    missing_ST_Vol_NonScore_w  missing_Cum_Shocks_NonScore_w  \
 0                          0                              0   
 
    missing_points_for  missing_points_against  
 0                   0                       0  )

We fit logistic regression models for offense and defense using the same predictor blocks and fixed effects as the count models so that interpretation stays aligned

In [4]:
# Categorical fixed-effect terms appended to every formula, and the column
# whose values define the clusters for the robust covariance.
FE_TEAM = "C(team)"
FE_TIME = "C(season_week)"
cluster_groups = df["team"]

# Lagged shock columns are optional: keep only those present in the frame,
# in lag order 1, 2, 3.
lag_candidates = (
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
)
LAG_COLS = [lag_name for lag_name in lag_candidates if lag_name in df.columns]

# Exposure block: contemporaneous terms first, then whichever lags exist.
exposure_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    *LAG_COLS,
]

# Controls shared by the defense and offense models. The two lists differ
# only in which side's previous-week injury column they include.
_COMMON_CONTROLS = (
    "offensive_snaps_w",
    "defensive_snaps_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "off_yards_per_play_w",
)
_WORKLOAD_CONTROL = "Cumulative_Workload_Index_w"

control_terms_base_def = [*_COMMON_CONTROLS, "Inj_Def_Last_w", _WORKLOAD_CONTROL]
control_terms_base_off = [*_COMMON_CONTROLS, "Inj_Off_Last_w", _WORKLOAD_CONTROL]

# Alternative pairs of game-script terms; each entry is (tag, terms).
script_specs = [
    ("points_for_diff", ["points_for", "score_diff_w"]),
    ("points_against_diff", ["points_against", "score_diff_w"]),
    ("points_for_against", ["points_for", "points_against"]),
]

# Tags listed from most to least preferred when several specs fit.
preferred_order = [
    "points_for_diff",
    "points_against_diff",
    "points_for_against",
]

def build_formula(outcome: str, base_controls: list[str], script_terms: list[str]) -> str:
    """Assemble a formula string of the form ``outcome ~ term + term + ...``.

    The right-hand side is, in order: the module-level ``exposure_terms``,
    then ``base_controls``, then ``script_terms``, and finally the team and
    time fixed-effect terms (``FE_TEAM``, ``FE_TIME``).

    Args:
        outcome: Name placed on the left-hand side of the formula.
        base_controls: Control terms added after the exposure terms.
        script_terms: Terms added after the controls.

    Returns:
        The formula as a single string.
    """
    fixed_effects = [FE_TEAM, FE_TIME]
    rhs_terms = exposure_terms + base_controls + script_terms + fixed_effects
    return " ~ ".join((outcome, " + ".join(rhs_terms)))

def fit_logit_glm(formula: str, data: pd.DataFrame, groups: pd.Series):
    """Fit a Binomial-family GLM from a formula with clustered covariance.

    Args:
        formula: Model formula passed to ``smf.glm``.
        data: Frame the formula is evaluated against.
        groups: Cluster labels forwarded as ``cov_kwds={"groups": ...}``.

    Returns:
        The fitted results object returned by ``.fit``.
    """
    binomial_model = smf.glm(formula=formula, data=data, family=sm.families.Binomial())
    return binomial_model.fit(cov_type="cluster", cov_kwds={"groups": groups})

def fit_logit_grid(outcome: str, base_controls: list[str]) -> tuple[str, str, object]:
    """Fit every entry of ``script_specs`` for one outcome and pick one.

    Each specification is fitted on the module-level ``df`` with
    ``cluster_groups`` as cluster labels. A specification whose fit raises is
    reported with a "logit failed" line and skipped; each success prints a
    "fit ok" line. Among the successes, the one whose tag comes earliest in
    ``preferred_order`` is returned (tags not listed there rank last).

    Args:
        outcome: Outcome column name used on the formula's left-hand side.
        base_controls: Control terms passed through to ``build_formula``.

    Returns:
        A ``(tag, formula, fitted_result)`` tuple for the selected spec.

    Raises:
        RuntimeError: If no specification could be fitted.
    """
    successes = []
    for spec_name, spec_terms in script_specs:
        spec_formula = build_formula(outcome, base_controls, spec_terms)
        try:
            result = fit_logit_glm(spec_formula, df, cluster_groups)
        except Exception as err:
            print("logit failed", outcome, spec_name, str(err))
        else:
            successes.append((spec_name, spec_formula, result))
            print("fit ok", outcome, spec_name)

    if not successes:
        raise RuntimeError(f"No logistic specifications fit successfully for {outcome}")

    def preference_rank(entry):
        # Position in preferred_order; unlisted tags sort after listed ones.
        name = entry[0]
        return preferred_order.index(name) if name in preferred_order else 999

    # min() returns the first entry with the lowest rank, which is the same
    # element a stable sort would place first.
    return min(successes, key=preference_rank)

# Run the specification grid once per side; each call returns the selected
# (tag, formula, fitted result) triple.
spec_tag_def, formula_def_used, logit_def = fit_logit_grid(
    "Any_Def_Injury_Next_w", control_terms_base_def
)
spec_tag_off, formula_off_used, logit_off = fit_logit_grid(
    "Any_Off_Injury_Next_w", control_terms_base_off
)

# Report the chosen spec and its formula for each side, with a blank line
# before each section.
print(
    "",
    f"selected logistic spec defense {spec_tag_def}",
    formula_def_used,
    "",
    f"selected logistic spec offense {spec_tag_off}",
    formula_off_used,
    sep="\n",
)

fit ok Any_Def_Injury_Next_w points_for_diff
fit ok Any_Def_Injury_Next_w points_against_diff
fit ok Any_Def_Injury_Next_w points_for_against
fit ok Any_Off_Injury_Next_w points_for_diff
fit ok Any_Off_Injury_Next_w points_against_diff
fit ok Any_Off_Injury_Next_w points_for_against

selected logistic spec defense points_for_diff
Any_Def_Injury_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + offensive_snaps_w + defensive_snaps_w + blowout_flag_w + short_week_flag_w + bye_last_week_flag_w + home_flag_w + off_yards_per_play_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + points_for + score_diff_w + C(team) + C(season_week)

selected logistic spec offense points_for_diff
Any_Off_Injury_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3