We initialize the Python imports and open a DuckDB connection that every later cell reuses

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

# Hide the FutureWarning whose text starts with the deviance-formula BIC notice.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

# Best effort: flip the GLM module's SET_USE_BIC_LLF switch on
# (presumably selecting the log-likelihood based BIC; confirm against the
# installed statsmodels). Any failure here is deliberately ignored.
try:
    from statsmodels.genmod import generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

# Locate nflpa.duckdb by walking upward from the working directory.
# The db/ subfolder layout is tried at every level first; only if that
# finds nothing is a bare nflpa.duckdb looked for. Within a layout the
# directory closest to the working directory wins.
CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

search_dirs = [CWD, *CWD.parents]
for rel_path in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    hit_dir = next((d for d in search_dirs if (d / rel_path).exists()), None)
    if hit_dir is not None:
        REPO_ROOT = hit_dir
        DB_FILE = hit_dir / rel_path
        break

# Fail loudly rather than letting DuckDB create a fresh, empty database.
if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb or nflpa.duckdb near this notebook")

# Name of the table/view whose presence is verified below.
MODEL_VIEW = "team_week_panel_nextweek_model"

# Opened read-write (read_only=False) on the file located above.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Count catalog entries in schema 'main' that match the model view name,
# accepting either a base table or a view.
exists_df = con.execute(f"""
SELECT
  COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{MODEL_VIEW}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()

view_hits = int(exists_df["n"].iloc[0])
if view_hits == 0:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to create the model view")

for label, value in (("connected db", str(DB_FILE)), ("model view", MODEL_VIEW)):
    print(label, value)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
model view team_week_panel_nextweek_model


A quick sanity check confirms that the modeling view contains only next-week-eligible rows, has non-null offensive and defensive outcomes, and has unique team-week keys

In [3]:
# Row-level check: total rows, rows whose has_next_week is not 1, and rows
# with a NULL defensive / offensive next-week outcome.
# The result is kept and printed explicitly: a notebook cell only renders its
# final expression, so an unassigned query result here would never be shown.
sanity_df = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week != 1 THEN 1 ELSE 0 END) AS bad_has_next_week,
  SUM(CASE WHEN Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_def,
  SUM(CASE WHEN Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_off
FROM {MODEL_VIEW}
""").df()
print(sanity_df.to_string(index=False))

# Discover which team identifier column the view exposes.
desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = set(desc["column_name"].astype(str).tolist())

if "team" in cols:
    TEAM_COL = "team"
elif "team_key" in cols:
    TEAM_COL = "team_key"
else:
    raise RuntimeError(f"No team id column found in {MODEL_VIEW}, expected team or team_key")

# Key uniqueness check: number of (season, week, team) groups that occur more
# than once. Left as the final expression so the notebook renders it.
con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_any,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
)
""").df()

Unnamed: 0,dup_rows
0,0


We build a single, clean modeling frame table with robust column detection, consistent names, and consistent missing value handling

In [4]:
def pick_first_present(candidates: list[str], present: set[str]) -> str | None:
    for c in candidates:
        if c in present:
            return c
    return None

# Column names reported by DESCRIBE for the model view.
desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
present_cols = set(desc["column_name"].astype(str))

# Panel keys: season and week have one accepted spelling, team has two.
SEASON_COL = pick_first_present(["season"], present_cols)
WEEK_COL = pick_first_present(["week"], present_cols)
TEAM_RAW_COL = pick_first_present(["team", "team_key"], present_cols)

if None in (SEASON_COL, WEEK_COL):
    raise RuntimeError("Missing season or week columns in model view")
if TEAM_RAW_COL is None:
    raise RuntimeError("Missing team column, expected team or team_key in model view")

# Both next-week outcome columns must exist under these exact names.
OUTCOME_DEF = "Inj_Def_Next_w"
OUTCOME_OFF = "Inj_Off_Next_w"
if not {OUTCOME_DEF, OUTCOME_OFF}.issubset(present_cols):
    raise RuntimeError("Missing Inj_Def_Next_w or Inj_Off_Next_w in model view")

# Lagged shock columns are optional; only the ones that exist are kept.
LAG_CANDIDATES = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
LAG_COLS = [lag for lag in LAG_CANDIDATES if lag in present_cols]

# Columns that may appear under alternative spellings; first match wins.
SHOCK_COL_MAIN = pick_first_present(
    ["ST_Shock_NonScore_w", "st_shock_nonscore_w", "shock_nonscore"],
    present_cols,
)
POINTS_FOR_COL = pick_first_present(["points_for_w", "points_for"], present_cols)
POINTS_AGAINST_COL = pick_first_present(["points_against_w", "points_against"], present_cols)
SCORE_DIFF_COL = pick_first_present(["score_diff_w", "score_diff"], present_cols)
OFF_YPP_COL = pick_first_present(["off_yards_per_play_w", "Off_yards_per_play_w"], present_cols)
CWI_COL = pick_first_present(["Cumulative_Workload_Index_w", "cumulative_workload_index_w"], present_cols)

# Every one of these must have resolved to a real column name.
required_core = [
    SHOCK_COL_MAIN,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]
if None in required_core:
    raise RuntimeError("Missing one or more required columns for steps 16 and 17, check your step 10 and step 11 outputs")

# Columns pulled from the model view, in output order: keys, outcomes,
# last-week injuries, context flags, snap counts, shock measures (plus any
# lag columns that were detected), then scoring / efficiency / workload.
select_cols = [
    SEASON_COL,
    WEEK_COL,
    TEAM_RAW_COL,
    OUTCOME_DEF,
    OUTCOME_OFF,
    "Inj_Def_Last_w",
    "Inj_Off_Last_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    *LAG_COLS,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]

column_sql = ", ".join(select_cols)
df = con.execute(f"SELECT {column_sql} FROM {MODEL_VIEW}").df()

# Map whichever spelling was detected onto one canonical name per column;
# columns that already carry the canonical name are left out of the map.
canonical_pairs = [
    (TEAM_RAW_COL, "team"),
    (POINTS_FOR_COL, "points_for"),
    (POINTS_AGAINST_COL, "points_against"),
    (SCORE_DIFF_COL, "score_diff_w"),
    (OFF_YPP_COL, "off_yards_per_play_w"),
    (CWI_COL, "Cumulative_Workload_Index_w"),
    (SHOCK_COL_MAIN, "ST_Shock_NonScore_w"),
]
rename_map = {raw: canonical for raw, canonical in canonical_pairs if raw != canonical}

df = df.rename(columns=rename_map)

# Key columns: team as string, season and week as integers.
df["team"] = df["team"].astype(str)
for key_col in (SEASON_COL, WEEK_COL):
    df[key_col] = df[key_col].astype(int)

# Single sortable integer key, e.g. season 2012 week 3 -> 201203.
df["season_week"] = (df[SEASON_COL] * 100 + df[WEEK_COL]).astype(int)

# Missing context flags are treated as 0.
for flag_col in ("blowout_flag_w", "short_week_flag_w", "bye_last_week_flag_w", "home_flag_w"):
    df[flag_col] = df[flag_col].fillna(0).astype(int)

# Missing last-week injury counts are treated as 0.
for injury_col in ("Inj_Def_Last_w", "Inj_Off_Last_w"):
    df[injury_col] = df[injury_col].fillna(0).astype(float)

# Missing shock indicators (current week and any lags) are treated as 0.
for shock_col in ["ST_Shock_NonScore_w", *LAG_COLS]:
    df[shock_col] = df[shock_col].fillna(0).astype(int)

# Alias of the shock indicator and its interaction with the blowout flag.
df["shock_nonscore"] = df["ST_Shock_NonScore_w"].astype(int)
df["shock_x_blowout"] = (df["shock_nonscore"] * df["blowout_flag_w"]).astype(int)

# Columns that are not zero-filled in this cell: rows missing any of them
# are dropped rather than imputed.
must_not_be_null = [
    OUTCOME_DEF,
    OUTCOME_OFF,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "points_for",
    "points_against",
    "score_diff_w",
    "off_yards_per_play_w",
    "Cumulative_Workload_Index_w",
]

before = len(df)
df = df.dropna(subset=must_not_be_null).reset_index(drop=True)
after = len(df)

print("rows before dropna", before)
print("rows after dropna", after)

# Persist the frame as a DuckDB table. The DataFrame is exposed to SQL under
# a temporary name; try/finally guarantees that registration is removed even
# if the CREATE statement fails, so a failed run leaves nothing registered.
con.register("step16_modeling_frame_tmp", df)
try:
    con.execute("CREATE OR REPLACE TABLE step16_modeling_frame AS SELECT * FROM step16_modeling_frame_tmp")
finally:
    con.unregister("step16_modeling_frame_tmp")

print("wrote duckdb table step16_modeling_frame")
df.head(3)

rows before dropna 5950
rows after dropna 5950
wrote duckdb table step16_modeling_frame


Unnamed: 0,season,week,team,Inj_Def_Next_w,Inj_Off_Next_w,Inj_Def_Last_w,Inj_Off_Last_w,blowout_flag_w,short_week_flag_w,bye_last_week_flag_w,...,ST_Shock_NonScore_w_minus_2,ST_Shock_NonScore_w_minus_3,points_for,points_against,score_diff_w,off_yards_per_play_w,Cumulative_Workload_Index_w,season_week,shock_nonscore,shock_x_blowout
0,2012,1,ATL,2.0,2.0,0.0,0.0,1,0,0,...,0,0,40,24,16,6.836364,-3.940011,201201,0,0
1,2012,2,ATL,3.0,2.0,2.0,2.0,0,0,0,...,0,0,27,21,6,4.412698,-3.638251,201202,0,0
2,2012,3,ATL,2.0,2.0,3.0,2.0,1,0,0,...,0,0,27,3,24,5.565217,-2.938346,201203,0,0


We compute first-pass mean and variance checks for both outcomes so that we can see unconditional overdispersion before running model-based tests

In [5]:
def outcome_dispersion_stats(y: pd.Series) -> dict:
    """Summarise the unconditional dispersion of an outcome series.

    The series is cast to float before any statistic is computed.

    Returns a dict with keys, in this order:
      mean           -- arithmetic mean
      var            -- sample variance (ddof=1)
      var_over_mean  -- var / mean, or NaN when the mean is not positive
      share_zero     -- fraction of entries equal to zero
      max            -- largest value
    """
    values = y.astype(float)
    avg = float(values.mean())
    sample_var = float(values.var(ddof=1))
    zero_share = float(values.eq(0).mean())

    # The ratio is only defined for a strictly positive mean.
    if avg > 0:
        ratio = sample_var / avg
    else:
        ratio = np.nan

    return {
        "mean": avg,
        "var": sample_var,
        "var_over_mean": ratio,
        "share_zero": zero_share,
        "max": float(values.max()),
    }

stats_def = outcome_dispersion_stats(df[OUTCOME_DEF])
stats_off = outcome_dispersion_stats(df[OUTCOME_OFF])

# One section per outcome: a header line, then one "name value" line per
# statistic, with a blank line separating the sections.
report_sections = [
    ("defense outcome", OUTCOME_DEF, stats_def),
    ("offense outcome", OUTCOME_OFF, stats_off),
]
for position, (label, outcome_name, summary) in enumerate(report_sections):
    if position > 0:
        print()
    print(label, outcome_name)
    for k in ("mean", "var", "var_over_mean", "share_zero", "max"):
        print(k, summary[k])

defense outcome Inj_Def_Next_w
mean 2.083865546218487
var 2.380761656150105
var_over_mean 1.1424737361152615
share_zero 0.15529411764705883
max 10.0

offense outcome Inj_Off_Next_w
mean 1.9201680672268908
var 2.1612169830110557
var_over_mean 1.125535321568121
share_zero 0.17714285714285713
max 9.0
