We import the required Python libraries and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd
import numpy as np

from pathlib import Path

# Locate the project root by walking upward from the working directory (the
# working directory itself is checked first) until a "db" folder is found, so
# the notebook can be launched from any subfolder of the project.
cwd = Path.cwd()

root = None
for p in [cwd] + list(cwd.parents):
    # is_dir() rather than exists(): a stray file named "db" must not be
    # mistaken for the database folder.
    if (p / "db").is_dir():
        root = p
        break

if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# duckdb.connect() creates a new, empty database when the file does not exist.
# That would leave a stray file behind and only surface one cell later as a
# misleading "tables are missing" error, so fail here with the real cause.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Database file not found at {DB_PATH}")

# Shared connection reused by every later cell.
con = duckdb.connect(str(DB_PATH))

# Resource limits for this session: 4 worker threads, 4GB of memory.
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='4GB'")

Quick sanity check to confirm that the required tables exist before building any outcomes

In [None]:
# Tables this step reads from; each one must already exist in the database.
required_tables = ["team_week_panel", "injuries", "rosters_weekly"]

# Names of all tables visible through the shared connection.
show_tables_df = con.execute("SHOW TABLES").df()
tables = set(show_tables_df["name"].tolist())

# Required tables that are absent, reported in the order they are listed above.
missing = [name for name in required_tables if name not in tables]

print("Missing tables", missing)

# Stop immediately when anything is absent; later cells would fail anyway.
if missing:
    missing_report = ", ".join(missing)
    raise RuntimeError("Step 9 cannot run because tables are missing, missing are " + missing_report)

We detect key columns dynamically so the logic does not assume specific team or player identifier names. This keeps the injury counts accurate no matter how the source tables name those columns

In [None]:
def _existing_cols(table_name):
    """Return the set of column names of ``table_name``.

    The names are read from the ``name`` column of the result of
    ``PRAGMA table_info`` for that table, queried through the module-level
    ``con`` connection.
    """
    table_info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    column_names = table_info["name"].tolist()
    return set(column_names)

def _pick_first(existing, candidates, label):
    for c in candidates:
        if c in existing:
            return c
    raise RuntimeError("Could not find a usable column for " + label + ", candidates were " + ", ".join(candidates))

def _pick_common(existing_a, existing_b, candidates, label):
    for c in candidates:
        if c in existing_a and c in existing_b:
            return c
    raise RuntimeError("Could not find a common column for " + label + ", candidates were " + ", ".join(candidates))

# Column names available in each source table, looked up once.
panel_cols = _existing_cols("team_week_panel")
inj_cols = _existing_cols("injuries")
rost_cols = _existing_cols("rosters_weekly")

# Panel team key: use "team_id" when present, otherwise "team".
# TEAM_ABBR_COL is the panel column the other tables are matched against.
TEAM_COL = "team_id" if "team_id" in panel_cols else "team"
TEAM_ABBR_COL = "team" if "team" in panel_cols else TEAM_COL

# The "team" fallback above is unverified. Fail here with a clear message
# rather than with a SQL binder error in a later cell. When TEAM_COL exists,
# TEAM_ABBR_COL is guaranteed to exist as well (it is either "team", which was
# just found, or TEAM_COL itself).
if TEAM_COL not in panel_cols:
    raise RuntimeError(
        "Could not find a usable column for team_week_panel team column, candidates were team_id, team"
    )

print("Using TEAM_COL", TEAM_COL)
print("Using TEAM_ABBR_COL", TEAM_ABBR_COL)

# The panel's own column name goes first so the join keys line up by name
# whenever the other table has a column of the same name.
inj_team_candidates = [TEAM_ABBR_COL, "team", "team_abbr", "posteam", "club", "team_id"]
rost_team_candidates = [TEAM_ABBR_COL, "team", "team_abbr", "posteam", "club", "team_id"]

INJ_TEAM_COL = _pick_first(inj_cols, inj_team_candidates, "injuries team column")
ROST_TEAM_COL = _pick_first(rost_cols, rost_team_candidates, "rosters_weekly team column")

season_candidates = ["season", "year"]
week_candidates = ["week"]

INJ_SEASON_COL = _pick_first(inj_cols, season_candidates, "injuries season column")
INJ_WEEK_COL = _pick_first(inj_cols, week_candidates, "injuries week column")

ROST_SEASON_COL = _pick_first(rost_cols, season_candidates, "rosters_weekly season column")
ROST_WEEK_COL = _pick_first(rost_cols, week_candidates, "rosters_weekly week column")

player_key_candidates = [
    "player_id",
    "gsis_id",
    "nfl_id",
    "pfr_id",
    "esb_id",
    "sportradar_id",
]

# The player key must exist under the same name in both tables.
PLAYER_KEY_COL = _pick_common(inj_cols, rost_cols, player_key_candidates, "player key between injuries and rosters_weekly")

# Position columns are optional: None means "not available".
pos_candidates = ["position", "pos", "position_abbr"]
ROST_POS_COL = next((c for c in pos_candidates if c in rost_cols), None)

# The players table is optional; its columns stay None when it is absent.
players_table_exists = "players" in tables
PLAYERS_PLAYER_KEY_COL = None
PLAYERS_POS_COL = None

if players_table_exists:
    ply_cols = _existing_cols("players")
    # Try PLAYER_KEY_COL first so the players table is keyed on the same
    # identifier as injuries/rosters whenever it has that column; otherwise
    # fall back to the generic candidate order.
    PLAYERS_PLAYER_KEY_COL = next(
        (c for c in [PLAYER_KEY_COL] + player_key_candidates if c in ply_cols),
        None,
    )
    PLAYERS_POS_COL = next((c for c in pos_candidates if c in ply_cols), None)

print("Using INJ_TEAM_COL", INJ_TEAM_COL)
print("Using ROST_TEAM_COL", ROST_TEAM_COL)
print("Using PLAYER_KEY_COL", PLAYER_KEY_COL)
print("Using ROST_POS_COL", ROST_POS_COL)
print("Players table exists", players_table_exists)
print("Using PLAYERS_PLAYER_KEY_COL", PLAYERS_PLAYER_KEY_COL)
print("Using PLAYERS_POS_COL", PLAYERS_POS_COL)

Quick sanity check that compares the unique team identifiers in the panel with those in the injury data. The overlap should be close to 100%; a low overlap means injuries would be "lost" in later joins because of mismatched abbreviations

In [None]:
# Distinct team identifiers on each side of the panel/injuries match.
panel_teams_sql = f"SELECT DISTINCT {TEAM_ABBR_COL} AS team_key FROM team_week_panel"
inj_teams_sql = f"SELECT DISTINCT {INJ_TEAM_COL} AS team_key FROM injuries"
panel_teams = con.execute(panel_teams_sql).df()
inj_teams = con.execute(inj_teams_sql).df()

# Compare as strings with missing values dropped, so the two sides are
# compared on the same footing.
panel_set = set(panel_teams["team_key"].dropna().astype(str))
inj_set = set(inj_teams["team_key"].dropna().astype(str))

# At most 50 examples per side keep the printed output readable.
only_in_panel = sorted(panel_set - inj_set)[:50]
only_in_inj = sorted(inj_set - panel_set)[:50]
shared_teams = panel_set & inj_set

print("Example teams only in team_week_panel", only_in_panel)
print("Example teams only in injuries", only_in_inj)
print("Panel team count", len(panel_set))
print("Injuries team count", len(inj_set))
print("Overlap team count", len(shared_teams))

We build a cleaned version of the raw injury reports by dropping rows with a missing season, week, team, or player key and removing duplicate player entries within the same game week, so that our starting point for counting new injuries has exactly one row per season, week, team, and player

In [None]:
# One row per (season, week, team_key, player_key) taken from the injuries
# table. Rows with a NULL in any of the four key columns are excluded, and the
# keys are cast to fixed types (INTEGER season/week, VARCHAR team/player).
injuries_distinct_sql = f"""
CREATE TABLE injuries_players_distinct AS
SELECT DISTINCT
  CAST({INJ_SEASON_COL} AS INTEGER) AS season,
  CAST({INJ_WEEK_COL} AS INTEGER) AS week,
  CAST({INJ_TEAM_COL} AS VARCHAR) AS team_key,
  CAST({PLAYER_KEY_COL} AS VARCHAR) AS player_key
FROM injuries
WHERE {INJ_SEASON_COL} IS NOT NULL
  AND {INJ_WEEK_COL} IS NOT NULL
  AND {INJ_TEAM_COL} IS NOT NULL
  AND {PLAYER_KEY_COL} IS NOT NULL
"""

# Drop first so the cell can be re-run without hitting an existing table.
for statement in ("DROP TABLE IF EXISTS injuries_players_distinct", injuries_distinct_sql):
    con.execute(statement)

# Report the size of the freshly built table.
injuries_distinct_rows = con.execute("SELECT COUNT(*) AS rows FROM injuries_players_distinct").df()
print(injuries_distinct_rows)

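We implement a lookup that pairs each team-week record in the panel with the players listed on that team's injury report for the same season and week (one row per team-week and injured player; team-weeks without a matching injury report produce no rows), which allows the model to identify who was already sidelined before calculating next-week outcomes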

In [None]:
# One row per panel team-week and injured player: the panel is inner-joined to
# the de-duplicated injury table on season, week and team key (the panel's
# team column is cast to VARCHAR to match the injury table's team_key).
injury_players_w_sql = f"""
CREATE TABLE injury_players_w AS
SELECT
  p.season,
  p.week,
  CAST(p.{TEAM_ABBR_COL} AS VARCHAR) AS team_key,
  i.player_key
FROM team_week_panel p
JOIN injuries_players_distinct i
  ON i.season = p.season
 AND i.week = p.week
 AND i.team_key = CAST(p.{TEAM_ABBR_COL} AS VARCHAR)
"""

# Drop first so the cell can be re-run without hitting an existing table.
for statement in ("DROP TABLE IF EXISTS injury_players_w", injury_players_w_sql):
    con.execute(statement)

# Report the size of the freshly built table.
injury_players_w_rows = con.execute("SELECT COUNT(*) AS rows FROM injury_players_w").df()
print(injury_players_w_rows)