We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [1]:
from pathlib import Path
import duckdb
import numpy as np
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

# --- Database connection ---------------------------------------------------
# The DuckDB file lives under ../db relative to the working directory; the
# folder is created on first run so duckdb.connect() can create the file.
DB_DIR = Path("../db")
DB_DIR.mkdir(parents=True, exist_ok=True)
DB_PATH = DB_DIR / "nflpa.duckdb"

con = duckdb.connect(str(DB_PATH))

# Object names are declared once and interpolated everywhere below so the
# SQL statements and error messages cannot drift apart.
PANEL_TABLE = "team_week_panel"
MODEL_VIEW = "team_week_panel_nextweek_model"
FLAGS_TABLE = "panel_next_week_flags"

print("db file", DB_PATH.resolve())

# --- Preconditions ---------------------------------------------------------
# SHOW TABLES is used to confirm the model view already exists before it is
# rebuilt below.
existing_views = set(con.execute("SHOW TABLES").df()["name"].tolist())
if MODEL_VIEW not in existing_views:
    raise RuntimeError(f"Missing model view {MODEL_VIEW}, run notebook 11 before step 13")

cols = con.execute(f"DESCRIBE {PANEL_TABLE}").df()["column_name"].tolist()
cols_set = set(cols)

# The panel may name its team identifier either way; team_key wins when both
# columns are present. TEAM_COL only ever takes one of these two literal
# values, so interpolating it into SQL below is safe.
if "team_key" in cols_set:
    TEAM_COL = "team_key"
elif "team" in cols_set:
    TEAM_COL = "team"
else:
    raise RuntimeError(f"Could not find team column in {PANEL_TABLE}, expected team_key or team")

# --- Rebuild the flags table and the model view ----------------------------
# The view is dropped before the flags table because the view selects from it.
con.execute(f"DROP VIEW IF EXISTS {MODEL_VIEW}")
con.execute(f"DROP TABLE IF EXISTS {FLAGS_TABLE}")

# has_next_week = 1 when the same team has a row for week + 1 in the same
# season, otherwise 0.
con.execute(f"""
CREATE TABLE {FLAGS_TABLE} AS
WITH base AS (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_key
  FROM {PANEL_TABLE}
)
SELECT
  season,
  week,
  team_key,
  CASE
    WHEN EXISTS (
      SELECT 1
      FROM base b2
      WHERE b2.season = b1.season
        AND b2.team_key = b1.team_key
        AND b2.week = b1.week + 1
    )
    THEN 1
    ELSE 0
  END AS has_next_week
FROM base b1
""")

# The model view keeps every panel column plus the flag, restricted to rows
# that have a following week.
con.execute(f"""
CREATE VIEW {MODEL_VIEW} AS
SELECT
  p.*,
  f.has_next_week
FROM {PANEL_TABLE} p
JOIN {FLAGS_TABLE} f
  ON p.season = f.season
 AND p.week = f.week
 AND p.{TEAM_COL} = f.team_key
WHERE f.has_next_week = 1
""")

# Last expression of the cell: row count of the rebuilt view. The original
# ran this identical query twice and discarded the first result.
con.execute(f"SELECT COUNT(*) AS n FROM {MODEL_VIEW}").df()

db file /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


Unnamed: 0,n
0,5950


A quick sanity check confirms that 'has_next_week' is always 1 in the model view and that the view has unique (season, week, team) keys.

In [2]:
# Check 1: every row exposed by the model view should carry has_next_week = 1.
# IS DISTINCT FROM also counts NULL flags as bad (a plain `!= 1` evaluates to
# NULL for them and would skip them), and COALESCE keeps the count at 0 when
# the view is empty.
flag_check = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  COALESCE(SUM(CASE WHEN has_next_week IS DISTINCT FROM 1 THEN 1 ELSE 0 END), 0) AS bad_has_next_week
FROM {MODEL_VIEW}
""").df()

# A notebook only renders the last expression of a cell, so this result is
# printed explicitly; otherwise it would be computed and silently dropped.
print(flag_check.to_string(index=False))

# Check 2: (season, week, team) should identify a row uniquely. The inner
# query keeps only keys that occur more than once; the outer query counts them.
con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_key,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
) d
""").df()

Unnamed: 0,dup_rows
0,0


We load the modeling dataset's schema from the model view and check that the required columns exist. We also select the safest 'NonScore' shock column for modeling, preferring the rolling version when one exists.

In [3]:
# Read the model view's schema; everything below works on column names only.
cols = con.execute(f"DESCRIBE {MODEL_VIEW}").df()["column_name"].tolist()
cols_set = set(cols)

# Prefer team_key over team, matching the precedence used when the model view
# was built, so the join key, the uniqueness check and the modelling code all
# refer to the same column when both exist. If neither is present the
# fallback name is reported by the required-column check below.
TEAM_COL = next((c for c in ("team_key", "team") if c in cols_set), "team_key")
SEASON_COL = "season"
WEEK_COL = "week"

OUTCOME_OFF = "Inj_Off_Next_w"
OUTCOME_DEF = "Inj_Def_Next_w"

# Columns the modelling code cannot run without.
required_base = [
    SEASON_COL, WEEK_COL, TEAM_COL,
    OUTCOME_OFF, OUTCOME_DEF,
    "blowout_flag_w",
    "ST_Shock_NonScore_w",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "offensive_no_play_snaps_w",
    "defensive_no_play_snaps_w",
    "points_for",
    "points_against",
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
]

missing = [c for c in required_base if c not in cols_set]
if missing:
    raise RuntimeError("Missing required columns in model view, " + ", ".join(missing))

# Use the rolling shock column when the view provides one; otherwise fall
# back to the plain column, which the check above guarantees is present.
SHOCK_COL_MAIN = "ST_Shock_NonScore_Roll_w" if "ST_Shock_NonScore_Roll_w" in cols_set else "ST_Shock_NonScore_w"

# Same preference order for the z-score column, except that it is optional:
# Z_COL_MAIN is None when neither variant exists.
Z_COL_MAIN = next(
    (c for c in ("Z_ST_NonScore_Roll_w", "Z_ST_NonScore_w") if c in cols_set),
    None,
)

print("team column", TEAM_COL)
print("shock column main", SHOCK_COL_MAIN)
print("z column main", Z_COL_MAIN)

team column team
shock column main ST_Shock_NonScore_w
z column main Z_ST_NonScore_w
