We import the Python dependencies and open a DuckDB connection that every later cell reuses

In [1]:
import os
from pathlib import Path
import duckdb

print("cwd", Path().resolve())

# Directories probed for DuckDB files, relative to the working directory.
search_roots = [
    Path("."),
    Path(".."),
    Path("./data"),
    Path("../data"),
    Path("../../data"),
]

# For each existing root, collect direct children first and then the
# recursive matches, so the discovery order stays stable.
candidates = [
    hit
    for root in search_roots
    if root.exists()
    for pattern in ("*.duckdb", "**/*.duckdb")
    for hit in root.glob(pattern)
]

# De-duplicate on the resolved absolute path while keeping first-seen order.
seen = set()
duckdb_files = []
for resolved in (c.resolve() for c in candidates):
    key = str(resolved)
    if key in seen:
        continue
    seen.add(key)
    duckdb_files.append(resolved)

print("duckdb files found")
for idx, path in enumerate(duckdb_files[:25]):
    print(idx, path)

# Prefer the first file named nflpa.duckdb; otherwise fall back to the first
# file found, or None when nothing was discovered.
db_file = next(
    (path for path in duckdb_files if path.name == "nflpa.duckdb"),
    duckdb_files[0] if duckdb_files else None,
)

if db_file is None:
    raise RuntimeError("No duckdb file found near this notebook, rerun notebook 02 or check where you stored the database file")

# Connection shared by the cells that follow.
con = duckdb.connect(str(db_file))
print("connected db", db_file)

tables = con.execute("SHOW TABLES").df()
tables

cwd /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/notebooks
duckdb files found
0 /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
1 /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/data/nflpa.duckdb
connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


Unnamed: 0,name
0,injuries
1,injuries_players_distinct
2,injuries_team_week_players
3,injury_counts_next
4,injury_counts_next_game
5,injury_counts_next_game_with_lags
6,injury_counts_with_lags
7,injury_outcomes_next_game_tmp
8,injury_outcomes_tmp
9,injury_players_next_game


We define small helpers for column discovery and strict required column checks so later sanity checks fail early if an upstream notebook was skipped or a column name drifted

In [2]:
import pandas as pd

# Names of the season / week / team columns that the queries below interpolate
# into SQL when addressing the panel's key.
SEASON_COL = "season"
WEEK_COL = "week"
TEAM_COL = "team"
# Name of the table that the helpers and sanity checks below query.
PANEL_TABLE = "team_week_panel"

def _existing_cols(table_name: str) -> list[str]:
    """Return the column names that ``DESCRIBE table_name`` reports.

    The query runs on the notebook-level ``con`` connection; ``table_name`` is
    interpolated into the SQL text as-is.
    """
    schema = con.execute(f"DESCRIBE {table_name}").df()
    return schema["column_name"].tolist()

def _require_cols(table_name: str, required: list[str]) -> None:
    """Raise if any of ``required`` is not a column of ``table_name``.

    The list of missing names is always printed (empty when all are present).

    Raises:
        RuntimeError: when at least one required column is absent.
    """
    present = set(_existing_cols(table_name))
    missing = [name for name in required if name not in present]
    print("Missing required columns", missing)
    if not missing:
        return
    raise RuntimeError(f"Missing columns in {table_name}, rerun earlier notebooks, missing, {missing}")

def _cols_matching(prefix: str = "", contains: str = "", suffix: str = "") -> list[str]:
    """Return the ``PANEL_TABLE`` columns that satisfy every supplied filter.

    A filter left empty (falsy) is ignored. Columns keep the order in which
    ``_existing_cols`` returns them.
    """

    def _keep(col: str) -> bool:
        # Each clause passes automatically when its filter is not supplied.
        return (
            (not prefix or col.startswith(prefix))
            and (not contains or contains in col)
            and (not suffix or col.endswith(suffix))
        )

    return [col for col in _existing_cols(PANEL_TABLE) if _keep(col)]

# Report how many columns DESCRIBE returns for the panel table.
print("team week panel columns", len(_existing_cols(PANEL_TABLE)))

team week panel columns 152


Quick sanity check to confirm that the wide-format structure correctly reflects the expected mix of injury, workload, and control features

In [3]:
# Show the first 30 panel columns, ordered by name, with their DuckDB types.
# A preceding unsorted `DESCRIBE ... .head(30)` call was removed: it was not
# the last expression of the cell, so the notebook never rendered its result
# and it only cost an extra query.
con.execute(f"""
SELECT
  column_name,
  column_type
FROM (DESCRIBE {PANEL_TABLE})
ORDER BY column_name
LIMIT 30
""").df()

Unnamed: 0,column_name,column_type
0,Cum_Shocks_All_w,HUGEINT
1,Cum_Shocks_NonScore_w,HUGEINT
2,Cum_Shocks_ScoreLinked_w,HUGEINT
3,Cumulative_Workload_Index_pg_w,DOUBLE
4,Cumulative_Workload_Index_w,DOUBLE
5,Inj_Def_LastGame_w,DOUBLE
6,Inj_Def_Last_w,DOUBLE
7,Inj_Def_NextGame_w,DOUBLE
8,Inj_Def_Next_w,DOUBLE
9,Inj_Off_LastGame_w,DOUBLE


Quick sanity check to confirm that both schema views contain the same columns and types

In [4]:
# Columns that identify one schema entry in both DESCRIBE results.
schema_key = ["column_name", "column_type"]

# df_a: DESCRIBE in native order; df_b: the same information sorted by name.
df_a = con.execute("DESCRIBE team_week_panel").df()
df_b = con.execute("""
SELECT column_name, column_type
FROM (DESCRIBE team_week_panel)
ORDER BY column_name
""").df()

print("rows describe", len(df_a))
print("rows sorted", len(df_b))

# Any column name reported more than once by DESCRIBE.
name_counts = df_a["column_name"].value_counts()
dupe_names = name_counts[name_counts > 1]
print("duplicate column names", dupe_names.to_dict())

# Outer-merge the two (name, type) sets; rows not tagged "both" exist in only
# one of the views.
left_schema = df_a[schema_key].drop_duplicates()
right_schema = df_b[schema_key].drop_duplicates()
merged_schema = left_schema.merge(
    right_schema,
    on=schema_key,
    how="outer",
    indicator=True,
)
mismatch = merged_schema[merged_schema["_merge"] != "both"]
mismatch.head(50)

rows describe 152
rows sorted 152
duplicate column names {}


Unnamed: 0,column_name,column_type,_merge


We verify the continuity of the dataset by aggregating the unique time periods present to ensure that the panel spans the full historical range required for the analysis

In [5]:
# The table-existence query that used to open this cell was removed: its result
# was neither displayed nor checked, and _require_cols below runs DESCRIBE on
# the table, which cannot succeed if the table is absent.

# Fail early if an upstream notebook did not produce the columns used later.
_require_cols(
    PANEL_TABLE,
    [
        SEASON_COL,
        WEEK_COL,
        TEAM_COL,
        "game_id",
        "points_for",
        "points_against",
        "ST_Load_All_w",
        "ST_Load_ScoreLinked_w",
        "ST_Load_NonScore_w",
        "ST_Punt_w",
        "ST_PuntReturn_w",
        "ST_Kickoff_w",
        "ST_KickReturn_w",
        "ST_FG_w",
        "ST_XP_w",
        "ST_Rare_w",
        "Inj_Off_Next_w",
        "Inj_Def_Next_w",
    ],
)

# Summarize the panel's extent: row count, number of seasons, and the
# season / week ranges it covers.
con.execute(f"""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT {SEASON_COL}) AS seasons,
  MIN({SEASON_COL}) AS min_season,
  MAX({SEASON_COL}) AS max_season,
  MIN({WEEK_COL}) AS min_week,
  MAX({WEEK_COL}) AS max_week
FROM {PANEL_TABLE}
""").df()

Missing required columns []


Unnamed: 0,rows,seasons,min_season,max_season,min_week,max_week
0,6782,13,2012,2024,1,18


Quick sanity check to confirm that 'team_week_panel' exists in the connected database 

In [6]:
# Count the SHOW TABLES entries named team_week_panel; the notebook renders the
# resulting one-row frame as this cell's output.
panel_presence_sql = """
SELECT
  SUM(CASE WHEN name = 'team_week_panel' THEN 1 ELSE 0 END) AS has_team_week_panel
FROM (SHOW TABLES)
"""
con.execute(panel_presence_sql).df()

Unnamed: 0,has_team_week_panel
0,1.0


Quick sanity check to confirm that 'team_week_panel' has no duplicate season-week-team rows and that the key count equals the row count to ensure that the final modeling table is a perfectly unique panel

In [7]:
# Total rows versus distinct season-week-team keys. The result is kept and
# printed: previously it was discarded because it was not the last expression
# of the cell, so the comparison never surfaced.
key_counts = con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()
print(key_counts)

# Up to 50 key combinations that occur more than once, worst offenders first.
dups = con.execute("""
SELECT
  season,
  week,
  team,
  COUNT(*) AS n
FROM team_week_panel
GROUP BY 1,2,3
HAVING COUNT(*) > 1
ORDER BY n DESC, season, week, team
LIMIT 50
""").df()

if not dups.empty:
    # Show the offending keys before failing; a bare `dups` expression in the
    # middle of a cell is not rendered by the notebook.
    print(dups)
    raise RuntimeError("Duplicate season week team rows exist in team_week_panel, investigate joins before proceeding")

# With no duplicated key groups the two counts should agree. A remaining gap
# presumably comes from NULL key parts (a NULL operand makes the concatenated
# key NULL, which COUNT(DISTINCT ...) ignores) -- TODO confirm if this fires.
n_rows = int(key_counts["rows"].iloc[0])
n_keys = int(key_counts["distinct_keys"].iloc[0])
if n_rows != n_keys:
    raise RuntimeError(
        f"team_week_panel has {n_rows} rows but {n_keys} distinct season week team keys, "
        "check for NULL season, week, or team values"
    )

We verify that the "next game" indicator accurately distinguishes between teams that play in the following calendar week and those heading into a bye to ensure that the 'w+1' outcome logic aligns with the physical game schedule

In [8]:
# Drop any earlier definition so the cell can be rerun.
con.execute("DROP VIEW IF EXISTS panel_next_week_flags")

# One output row per panel row: has_next_week is 1 when the panel also holds a
# row for the same season and team at week + 1, otherwise 0.
next_week_flags_sql = f"""
CREATE TEMP VIEW panel_next_week_flags AS
WITH base AS (
  SELECT
    {SEASON_COL} AS season,
    {WEEK_COL} AS week,
    CAST({TEAM_COL} AS VARCHAR) AS team_key
  FROM {PANEL_TABLE}
),
nxt AS (
  SELECT
    {SEASON_COL} AS season,
    {WEEK_COL} AS week,
    CAST({TEAM_COL} AS VARCHAR) AS team_key
  FROM {PANEL_TABLE}
)
SELECT
  b.season,
  b.week,
  b.team_key,
  CASE WHEN n.week IS NULL THEN 0 ELSE 1 END AS has_next_week
FROM base b
LEFT JOIN nxt n
  ON n.season = b.season
 AND n.team_key = b.team_key
 AND n.week = b.week + 1
"""
con.execute(next_week_flags_sql)

# Tally how many panel rows fall on each side of the flag.
flag_counts_sql = """
SELECT
  has_next_week,
  COUNT(*) AS n_rows
FROM panel_next_week_flags
GROUP BY 1
ORDER BY 1
"""
con.execute(flag_counts_sql).df()

Unnamed: 0,has_next_week,n_rows
0,0,832
1,1,5950


Quick sanity check to confirm that next week injury outcomes are only defined when week 'w+1' exists for that team season and that they are null when the next week row is missing

In [9]:
# Keep only the expected outcome columns that actually exist in the panel.
cols_now = set(_existing_cols(PANEL_TABLE))
inj_next_week_cols = [
    name for name in ("Inj_Off_Next_w", "Inj_Def_Next_w") if name in cols_now
]

if not inj_next_week_cols:
    raise RuntimeError("No next week injury outcome columns found, expected Inj_Off_Next_w and Inj_Def_Next_w")

print("Next week injury columns checked", inj_next_week_cols)

# SQL fragments built from the surviving columns: a predicate that is true
# when any outcome is populated, and the matching select list.
select_any_defined = " OR ".join(f"p.{name} IS NOT NULL" for name in inj_next_week_cols)
select_outcomes = ", ".join(f"p.{name}" for name in inj_next_week_cols)

# Count rows that carry an outcome although no week + 1 row exists.
bad_defined = con.execute(f"""
SELECT
  COUNT(*) AS bad_rows
FROM {PANEL_TABLE} p
JOIN panel_next_week_flags f
  ON f.season = p.{SEASON_COL}
 AND f.week = p.{WEEK_COL}
 AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
WHERE f.has_next_week = 0
  AND ({select_any_defined})
""").df()

bad_rows = int(bad_defined["bad_rows"].iat[0])
print(bad_defined)

if bad_rows:
    # Print up to 50 offending rows (most recent first) before failing.
    sample = con.execute(f"""
    SELECT
      p.{SEASON_COL} AS season,
      p.{WEEK_COL} AS week,
      p.{TEAM_COL} AS team,
      f.has_next_week,
      {select_outcomes}
    FROM {PANEL_TABLE} p
    JOIN panel_next_week_flags f
      ON f.season = p.{SEASON_COL}
     AND f.week = p.{WEEK_COL}
     AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
    WHERE f.has_next_week = 0
      AND ({select_any_defined})
    ORDER BY p.{SEASON_COL} DESC, p.{WEEK_COL} DESC, p.{TEAM_COL}
    LIMIT 50
    """).df()
    print(sample)
    raise RuntimeError("Next week injury outcomes are populated when week plus 1 does not exist, fix notebook 09 logic or apply the step 11 repair cell")

Next week injury columns checked ['Inj_Off_Next_w', 'Inj_Def_Next_w']
   bad_rows
0         0


We handle the undefined next week cases by creating a modeling view that drops all rows where week 'w+1' does not exist, which includes final game weeks and pre-bye game weeks.

In [10]:
# Drop any earlier definition so the cell can be rerun.
con.execute("DROP VIEW IF EXISTS team_week_panel_nextweek_model")

# Modeling view: every panel row whose has_next_week flag is 1.
# NOTE(review): this is a regular (non-TEMP) view that reads from
# panel_next_week_flags, which is created with CREATE TEMP VIEW. Confirm the
# view is still usable from a fresh connection, or recreate it per session.
con.execute(f"""
CREATE VIEW team_week_panel_nextweek_model AS
SELECT
  p.*,
  f.has_next_week
FROM {PANEL_TABLE} p
JOIN panel_next_week_flags f
  ON f.season = p.{SEASON_COL}
 AND f.week = p.{WEEK_COL}
 AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
WHERE f.has_next_week = 1
""")

# Row count versus distinct key count of the modeling view. The result is kept
# and printed: previously it was discarded because it was not the last
# expression of the cell, so it never appeared in the notebook output.
model_counts = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  COUNT(DISTINCT {SEASON_COL} || '-' || {WEEK_COL} || '-' || {TEAM_COL}) AS distinct_keys_model
FROM team_week_panel_nextweek_model
""").df()
print(model_counts)

# Per season: total panel rows and how many are kept or dropped by the flag.
con.execute(f"""
SELECT
  p.{SEASON_COL} AS season,
  COUNT(*) AS rows_total,
  SUM(CASE WHEN f.has_next_week = 1 THEN 1 ELSE 0 END) AS rows_kept,
  SUM(CASE WHEN f.has_next_week = 0 THEN 1 ELSE 0 END) AS rows_dropped
FROM {PANEL_TABLE} p
JOIN panel_next_week_flags f
  ON f.season = p.{SEASON_COL}
 AND f.week = p.{WEEK_COL}
 AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
GROUP BY 1
ORDER BY 1
""").df()

Unnamed: 0,season,rows_total,rows_kept,rows_dropped
0,2012,512,448.0,64.0
1,2013,512,448.0,64.0
2,2014,512,448.0,64.0
3,2015,512,448.0,64.0
4,2016,512,448.0,64.0
5,2017,512,450.0,62.0
6,2018,512,448.0,64.0
7,2019,512,448.0,64.0
8,2020,512,448.0,64.0
9,2021,544,480.0,64.0
