We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [1]:
import warnings
from pathlib import Path
import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Ask statsmodels to switch GLM BIC to its log-likelihood based formula (see
# the statsmodels GLM documentation for SET_USE_BIC_LLF).
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except (ImportError, AttributeError):
    # Best effort only: the module or the setter may be absent in the
    # installed statsmodels. Anything else is a real error and should surface
    # instead of being swallowed. The warning filter below covers this case.
    pass

# Fallback: silence the deviance-based BIC FutureWarning when the opt-in above
# was not available. `message` is a regex matched against the start of the
# warning text.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

# Locate nflpa.duckdb by searching the working directory and each of its
# ancestors, then open the shared connection.
CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

# The preferred layout (db/nflpa.duckdb) is tried against every ancestor
# before falling back to a bare nflpa.duckdb in any ancestor.
search_dirs = [CWD, *CWD.parents]
for rel_path in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    hit_dir = next((d for d in search_dirs if (d / rel_path).exists()), None)
    if hit_dir is not None:
        REPO_ROOT, DB_FILE = hit_dir, hit_dir / rel_path
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb under the repo root")

con = duckdb.connect(str(DB_FILE))

# Panel key column names and the relation that the queries below read from.
SEASON_COL, WEEK_COL, TEAM_COL = "season", "week", "team"
MODEL_VIEW = "team_week_panel_nextweek_model"

# Stop early if the model view is not among the names SHOW TABLES reports.
existing = set(con.execute("SHOW TABLES").df()["name"])
if MODEL_VIEW not in existing:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to create the model view")

print("connected db", DB_FILE)
print("model view", MODEL_VIEW)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
model view team_week_panel_nextweek_model


Quick sanity check to confirm that `has_next_week` is always 1 in the model view and that each (season, week, team) key appears only once.

In [2]:
# Row-level check: total rows, rows where has_next_week is not 1, and rows
# with a NULL Inj_Def_Next_w outcome.
model_checks = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week != 1 THEN 1 ELSE 0 END) AS bad_has_next_week,
  SUM(CASE WHEN Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_def
FROM {MODEL_VIEW}
""").df()
# A notebook cell only renders its last expression, so this result has to be
# printed explicitly; otherwise the counts above are computed and discarded.
print(model_checks.to_string(index=False))

# Key-uniqueness check: number of (season, week, team) groups that occur more
# than once. Left as the cell's final expression so it is still rendered.
con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    team AS team_key,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
) d
""").df()

Unnamed: 0,dup_rows
0,0


Quick sanity check to confirm that all required columns exist. Where a column may appear under more than one name, we resolve it to a single constant so the model formulas stay stable.

In [3]:
# Column names reported by DESCRIBE for the model view: an ordered list plus
# a set for fast membership tests.
cols = list(con.execute(f"DESCRIBE {MODEL_VIEW}").df()["column_name"])
cols_set = {*cols}

def pick_first_present(options):
    """Return the first name in *options* that is in ``cols_set``, else ``None``.

    Candidates are tried in the order given. ``cols_set`` is read from module
    scope at call time.
    """
    return next((name for name in options if name in cols_set), None)

# Resolve the column names the model view actually exposes. Each constant is
# the first candidate name found in cols_set.
TEAM_COL = pick_first_present(["team", "team_key"])
if TEAM_COL is None:
    raise RuntimeError("Missing team column, expected team or team_key in model view")

POINTS_FOR_COL = pick_first_present(["points_for_w", "points_for"])
POINTS_AGAINST_COL = pick_first_present(["points_against_w", "points_against"])
SCORE_DIFF_COL = pick_first_present(["score_diff_w", "score_diff"])
OFF_YPP_COL = pick_first_present(["off_yards_per_play_w", "Off_yards_per_play_w"])
CWI_COL = pick_first_present(["Cumulative_Workload_Index_w", "cumulative_workload_index_w"])

# Fail on the first unresolved column, checked in this fixed order.
_resolution_failures = (
    (
        None in (POINTS_FOR_COL, POINTS_AGAINST_COL),
        "Missing points columns, expected points_for and points_against variants",
    ),
    (
        SCORE_DIFF_COL is None,
        "Missing score diff column, expected score_diff_w",
    ),
    (
        OFF_YPP_COL is None,
        "Missing offensive yards per play column, expected off_yards_per_play_w",
    ),
    (
        CWI_COL is None,
        "Missing workload index column, expected Cumulative_Workload_Index_w",
    ),
)
for _unresolved, _problem in _resolution_failures:
    if _unresolved:
        raise RuntimeError(_problem)

# Outcome column and the three lagged shock columns.
OUTCOME_DEF = "Inj_Def_Next_w"
LAG_COLS = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]

# Use the rolling shock column when the view has it, otherwise the plain one.
SHOCK_COL_MAIN = "ST_Shock_NonScore_w"
if "ST_Shock_NonScore_Roll_w" in cols_set:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_Roll_w"

# Every column that must be present in the model view.
required = [
    SEASON_COL,
    WEEK_COL,
    TEAM_COL,
    OUTCOME_DEF,
    "Inj_Def_Last_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    OFF_YPP_COL,
    SCORE_DIFF_COL,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    CWI_COL,
    *LAG_COLS,
]

# Report all absent columns in one error, in the order listed above.
missing = [name for name in required if name not in cols_set]
if missing:
    raise RuntimeError("Missing required Step 14 columns in model view, " + ", ".join(missing))

# Echo the resolved names so the notebook output records what was used.
for _label, _resolved in (
    ("team column", TEAM_COL),
    ("shock column main", SHOCK_COL_MAIN),
    ("points_for column", POINTS_FOR_COL),
    ("points_against column", POINTS_AGAINST_COL),
    ("score_diff column", SCORE_DIFF_COL),
    ("off_ypp column", OFF_YPP_COL),
    ("workload index column", CWI_COL),
):
    print(_label, _resolved)

team column team
shock column main ST_Shock_NonScore_w
points_for column points_for
points_against column points_against
score_diff column score_diff_w
off_ypp column off_yards_per_play_w
workload index column Cumulative_Workload_Index_w
