We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [1]:
import warnings
from pathlib import Path
import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Best-effort switch of the statsmodels GLM BIC option. The setter is absent
# on statsmodels releases that predate it, and the notebook should still run
# there, so only the "option not available" failures are tolerated.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except (ImportError, AttributeError):
    # Any other exception is unexpected and is allowed to surface instead of
    # being silently swallowed.
    pass

# Silence the FutureWarning about the deviance-based BIC formula so model
# summaries stay readable when that warning is still emitted.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

# Walk from the working directory up through its parents. The preferred
# layout (db/nflpa.duckdb) is tried across every directory first; the bare
# file name is only considered if that whole pass finds nothing.
search_roots = [CWD, *CWD.parents]
for relative_parts in (("db", "nflpa.duckdb"), ("nflpa.duckdb",)):
    matching_root = next(
        (root for root in search_roots if root.joinpath(*relative_parts).exists()),
        None,
    )
    if matching_root is not None:
        REPO_ROOT = matching_root
        DB_FILE = matching_root.joinpath(*relative_parts)
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb under the repo root")

# Single shared connection reused by the cells below.
con = duckdb.connect(str(DB_FILE))

# Names of the season / week / team key columns used throughout the notebook.
SEASON_COL, WEEK_COL, TEAM_COL = "season", "week", "team"

# Relation that the modeling frame is read from.
MODEL_VIEW = "team_week_panel_nextweek_model"

# Fail fast when the model view is not listed in this database.
existing = set(con.execute("SHOW TABLES").df()["name"].tolist())
if not existing.issuperset({MODEL_VIEW}):
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to create the model view")

for label, value in (("connected db", str(DB_FILE)), ("model view", MODEL_VIEW)):
    print(label, value)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
model view team_week_panel_nextweek_model


Quick sanity check to confirm that `has_next_week` is always 1 in the model view and that the view has unique (season, week, team) keys.

In [3]:
# Check 1: count rows where has_next_week is not 1 and rows with a NULL
# offensive outcome. A notebook cell only renders its final expression, so
# this frame is printed explicitly; otherwise the result is silently dropped.
flag_and_null_check = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week != 1 THEN 1 ELSE 0 END) AS bad_has_next_week,
  SUM(CASE WHEN Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_off
FROM {MODEL_VIEW}
""").df()
print(flag_and_null_check.to_string(index=False))

# Check 2: count (season, week, team) groups that occur more than once.
# Left as the cell's final expression so the notebook renders it as before.
con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    team AS team_key,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
) d
""").df()

Unnamed: 0,dup_rows
0,0


Quick sanity check to confirm that all required columns exist. We normalize any naming differences so formulas are stable

In [4]:
# Column inventory of the model view: `cols` keeps the DESCRIBE order,
# `cols_set` backs the membership tests that follow.
schema_df = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = schema_df["column_name"].tolist()
cols_set = {*cols}

def pick_first_present(options, available=None):
    """Return the first name in *options* that is an available column.

    Args:
        options: Candidate column names, in order of preference.
        available: Optional collection of column names to test against.
            When omitted, the module-level ``cols_set`` is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        The first candidate found in the collection, or ``None`` when no
        candidate is present (including when *options* is empty).
    """
    pool = cols_set if available is None else available
    return next((name for name in options if name in pool), None)

# The team key may be stored under either name; take the first one present.
TEAM_COL = pick_first_present(["team", "team_key"])
if TEAM_COL is None:
    raise RuntimeError("Missing team column, expected team or team_key in model view")

# Resolve columns whose spelling differs between versions of the view.
POINTS_FOR_COL = pick_first_present(["points_for_w", "points_for"])
POINTS_AGAINST_COL = pick_first_present(["points_against_w", "points_against"])
SCORE_DIFF_COL = pick_first_present(["score_diff_w", "score_diff"])
OFF_YPP_COL = pick_first_present(["off_yards_per_play_w", "Off_yards_per_play_w"])
CWI_COL = pick_first_present(["Cumulative_Workload_Index_w", "cumulative_workload_index_w"])

# (unresolved?, message) pairs, evaluated in order so the first failure wins.
unresolved_checks = (
    (
        POINTS_FOR_COL is None or POINTS_AGAINST_COL is None,
        "Missing points columns, expected points_for and points_against variants",
    ),
    (SCORE_DIFF_COL is None, "Missing score diff column, expected score_diff_w"),
    (
        OFF_YPP_COL is None,
        "Missing offensive yards per play column, expected off_yards_per_play_w",
    ),
    (
        CWI_COL is None,
        "Missing workload index column, expected Cumulative_Workload_Index_w",
    ),
)
for is_unresolved, problem in unresolved_checks:
    if is_unresolved:
        raise RuntimeError(problem)

OUTCOME_OFF = "Inj_Off_Next_w"
LAG_COLS = [f"ST_Shock_NonScore_w_minus_{lag}" for lag in (1, 2, 3)]

# Use the rolling shock column when the view provides it, else the plain one.
if "ST_Shock_NonScore_Roll_w" in cols_set:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_Roll_w"
else:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_w"

required = [
    SEASON_COL,
    WEEK_COL,
    TEAM_COL,
    OUTCOME_OFF,
    "Inj_Off_Last_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    OFF_YPP_COL,
    SCORE_DIFF_COL,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    CWI_COL,
    *LAG_COLS,
]

# Report every absent column at once rather than stopping at the first.
missing = [name for name in required if name not in cols_set]
if missing:
    raise RuntimeError(f"Missing required Step 15 columns in model view, {', '.join(missing)}")

resolved_report = (
    ("team column", TEAM_COL),
    ("shock column main", SHOCK_COL_MAIN),
    ("points_for column", POINTS_FOR_COL),
    ("points_against column", POINTS_AGAINST_COL),
    ("score_diff column", SCORE_DIFF_COL),
    ("off_ypp column", OFF_YPP_COL),
    ("workload index column", CWI_COL),
)
for label, resolved in resolved_report:
    print(label, resolved)

team column team
shock column main ST_Shock_NonScore_w
points_for column points_for
points_against column points_against
score_diff column score_diff_w
off_ypp column off_yards_per_play_w
workload index column Cumulative_Workload_Index_w


We build the modeling frame from the model view, then construct shock variables and interactions, and finally drop rows only where truly required inputs are missing

In [5]:
# Columns pulled from the view, using whichever physical names were resolved.
select_cols = [
    SEASON_COL,
    WEEK_COL,
    TEAM_COL,
    OUTCOME_OFF,
    "Inj_Off_Last_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    *LAG_COLS,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]

column_list_sql = ", ".join(select_cols)
df = con.execute(f"SELECT {column_list_sql} FROM {MODEL_VIEW}").df()

# Map each resolved column onto its canonical name; columns that already
# carry the canonical name are left out of the mapping.
canonical_names = {
    POINTS_FOR_COL: "points_for",
    POINTS_AGAINST_COL: "points_against",
    SCORE_DIFF_COL: "score_diff_w",
    OFF_YPP_COL: "off_yards_per_play_w",
    CWI_COL: "Cumulative_Workload_Index_w",
}
rename_map = {
    source: target for source, target in canonical_names.items() if source != target
}
df = df.rename(columns=rename_map)

# Key columns: team as text, season and week as integers.
for key_col, key_type in ((TEAM_COL, str), (SEASON_COL, int), (WEEK_COL, int)):
    df[key_col] = df[key_col].astype(key_type)

# Sortable combined key, e.g. season 2012 week 3 -> 201203.
df["season_week"] = df[SEASON_COL].mul(100).add(df[WEEK_COL]).astype(int)

# Shock indicator, blowout flag and their interaction; missing values count as 0.
df["shock_nonscore"] = df[SHOCK_COL_MAIN].fillna(0).astype(int)
df["blowout_flag_w"] = df["blowout_flag_w"].fillna(0).astype(int)
df["shock_x_blowout"] = df["shock_nonscore"].mul(df["blowout_flag_w"]).astype(int)

# Lagged shocks and schedule flags become integers with missing values as 0.
flag_cols = ["short_week_flag_w", "bye_last_week_flag_w", "home_flag_w"]
for int_col in [*LAG_COLS, *flag_cols]:
    df[int_col] = df[int_col].fillna(0).astype(int)

# Continuous shock measures become floats with missing values as 0.
for float_col in ("ST_Vol_NonScore_w", "Cum_Shocks_NonScore_w"):
    df[float_col] = df[float_col].fillna(0).astype(float)

# Inputs that are never imputed: rows lacking any of them are removed.
must_not_be_null = [
    OUTCOME_OFF,
    "Inj_Off_Last_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "points_for",
    "points_against",
    "score_diff_w",
    "off_yards_per_play_w",
    "Cumulative_Workload_Index_w",
]

before = len(df)
df = df.dropna(subset=must_not_be_null).reset_index(drop=True)
after = len(df)

print("rows before dropna", before)
print("rows after dropna", after)
df.head(3)

rows before dropna 5950
rows after dropna 5950


Unnamed: 0,season,week,team,Inj_Off_Next_w,Inj_Off_Last_w,blowout_flag_w,short_week_flag_w,bye_last_week_flag_w,home_flag_w,offensive_snaps_w,...,ST_Shock_NonScore_w_minus_2,ST_Shock_NonScore_w_minus_3,points_for,points_against,score_diff_w,off_yards_per_play_w,Cumulative_Workload_Index_w,season_week,shock_nonscore,shock_x_blowout
0,2012,1,ATL,2.0,0.0,1,0,0,0,55.0,...,0,0,40,24,16,6.836364,-3.940011,201201,0,0
1,2012,2,ATL,2.0,2.0,0,0,0,1,65.0,...,0,0,27,21,6,4.412698,-3.638251,201202,0,0
2,2012,3,ATL,2.0,2.0,1,0,0,0,69.0,...,0,0,27,3,24,5.565217,-2.938346,201203,0,0


Quick sanity check to confirm that the modeling frame has no nulls in key predictors and that core numeric fields look finite and usable

In [6]:
# Outcome and predictors that must be fully populated in the modeling frame.
check_cols = [
    OUTCOME_OFF,
    "shock_nonscore",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "points_for",
    "points_against",
    "score_diff_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "off_yards_per_play_w",
    "Inj_Off_Last_w",
    "Cumulative_Workload_Index_w",
]

# Per-column null tallies, kept as plain ints so the printed dict stays clean.
null_counts = {
    name: int(n_null) for name, n_null in df[check_cols].isna().sum().items()
}

# Report label -> column to scan for NaN / +-inf values.
finite_targets = {
    "ST_Vol_NonScore_w_nonfinite": "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w_nonfinite": "Cum_Shocks_NonScore_w",
    "off_yards_per_play_w_nonfinite": "off_yards_per_play_w",
    "CWI_nonfinite": "Cumulative_Workload_Index_w",
}
nonfinite = {
    label: int((~np.isfinite(df[column].astype(float))).sum())
    for label, column in finite_targets.items()
}

for label, value in (
    ("null counts", null_counts),
    ("nonfinite counts", nonfinite),
    ("n rows", len(df)),
    ("n teams", df[TEAM_COL].nunique()),
):
    print(label, value)

null counts {'Inj_Off_Next_w': 0, 'shock_nonscore': 0, 'ST_Vol_NonScore_w': 0, 'Cum_Shocks_NonScore_w': 0, 'points_for': 0, 'points_against': 0, 'score_diff_w': 0, 'offensive_snaps_w': 0, 'defensive_snaps_w': 0, 'off_yards_per_play_w': 0, 'Inj_Off_Last_w': 0, 'Cumulative_Workload_Index_w': 0}
nonfinite counts {'ST_Vol_NonScore_w_nonfinite': 0, 'Cum_Shocks_NonScore_w_nonfinite': 0, 'off_yards_per_play_w_nonfinite': 0, 'CWI_nonfinite': 0}
n rows 5950
n teams 35


We compute dispersion diagnostics for the offensive next week injury count so the Poisson baseline and Negative Binomial contingency choice is explicitly grounded in the data

In [7]:
def outcome_dispersion_stats(y: pd.Series) -> dict:
    """Summarize the dispersion of a count outcome.

    Args:
        y: Outcome values; they are cast to float before any statistic
            is computed.

    Returns:
        A dict with keys ``mean``, ``var`` (sample variance, ``ddof=1``),
        ``var_over_mean`` (NaN unless the mean is strictly positive),
        ``share_zero`` (fraction of values equal to 0) and ``max``.
    """
    values = y.astype(float)
    avg = float(values.mean())
    spread = float(values.var(ddof=1))

    # The ratio is only defined for a strictly positive mean.
    if avg > 0:
        ratio = spread / avg
    else:
        ratio = np.nan

    return dict(
        mean=avg,
        var=spread,
        var_over_mean=ratio,
        share_zero=float(values.eq(0).mean()),
        max=float(values.max()),
    )

# Dispersion summary for the offensive next-week injury count.
stats_off = outcome_dispersion_stats(df[OUTCOME_OFF])

print("offense outcome", OUTCOME_OFF, stats_off)

# Overdispersion counts as material when the variance-to-mean ratio is
# defined (not NaN) and reaches at least 1.5.
ratio_off = stats_off["var_over_mean"]
if np.isnan(ratio_off):
    material_overdispersion_off = False
else:
    material_overdispersion_off = ratio_off >= 1.5
print("material overdispersion offense", bool(material_overdispersion_off))

offense outcome Inj_Off_Next_w {'mean': 1.9201680672268908, 'var': 2.1612169830110557, 'var_over_mean': 1.125535321568121, 'share_zero': 0.17714285714285713, 'max': 9.0}
material overdispersion offense False
