We open the existing DuckDB file so all downstream tables are persisted in a single database and the notebook runs against a stable state

In [1]:
from pathlib import Path
import duckdb

DB_DIR = Path("../db")
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

print("connected")

connected


Quick sanity check to verify that the schedules relation is accessible and non empty before any transformations are attempted.

In [2]:
print("schedules", con.execute("SELECT COUNT(*) FROM schedules").fetchone()[0])

schedules 3544


We expand each game into two team perspective rows and establishes the one row per team per week structure with the required primary keys

In [3]:
con.execute("""
CREATE OR REPLACE TABLE team_game_weeks AS
WITH base AS (
  SELECT
    s.season,
    s.week,
    s.game_id,
    TRY_CAST(CAST(s.gameday AS VARCHAR) AS DATE) AS game_date,
    s.home_team,
    s.away_team,
    s.home_score,
    s.away_score
  FROM schedules s
)
SELECT
  season,
  week,
  game_id,
  game_date,
  home_team AS team,
  away_team AS opponent,
  1 AS home_flag,
  0 AS away_flag,
  home_score AS points_for,
  away_score AS points_against
FROM base

UNION ALL

SELECT
  season,
  week,
  game_id,
  game_date,
  away_team AS team,
  home_team AS opponent,
  0 AS home_flag,
  1 AS away_flag,
  away_score AS points_for,
  home_score AS points_against
FROM base
""")

print("team_game_weeks", con.execute("SELECT COUNT(*) FROM team_game_weeks").fetchone()[0])

team_game_weeks 7088


Quick sanity check that ensures the generated primary key, season, week, team, game_id is unique and confirms there are no accidental duplicates

In [4]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT CAST(season AS VARCHAR) || '-' || CAST(week AS VARCHAR) || '-' || team || '-' || game_id) AS distinct_keys
FROM team_game_weeks
""").df()

Unnamed: 0,rows,distinct_keys
0,7088,7088


We checkpoint the database to reduce wal growth and closes the connection cleanly

In [5]:
con.execute("CHECKPOINT")
con.close()

print("db successfully closed")

db successfully closed


For each team week, attach game result and stats, injury outcomes, rest and schedule variables

We derive team perspective game result features that are used as model controls and reporting variables

In [6]:
DB_DIR = Path("../db")
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

_ = con.execute("""
CREATE OR REPLACE TABLE team_game_weeks_results AS
SELECT
  season,
  week,
  team,
  opponent,
  game_id,
  game_date,0
  home_flag,
  away_flag,
  points_for,
  points_against,
  points_for - points_against AS score_diff,
  CASE WHEN points_for > points_against THEN 1 ELSE 0 END AS win_flag,
  CASE WHEN points_for = points_against THEN 1 ELSE 0 END AS tie_flag,
  CASE WHEN ABS(points_for - points_against) >= 14 THEN 1 ELSE 0 END AS blowout_flag
FROM team_game_weeks
""")

print("created team_game_weeks_results")

con.execute("SHOW TABLES").df()
con.execute("SELECT * FROM team_game_weeks LIMIT 5").df()

created team_game_weeks_results


Unnamed: 0,season,week,game_id,game_date,team,opponent,home_flag,away_flag,points_for,points_against
0,2012,1,2012_01_DAL_NYG,2012-09-05,NYG,DAL,1,0,17,24
1,2012,1,2012_01_IND_CHI,2012-09-09,CHI,IND,1,0,41,21
2,2012,1,2012_01_PHI_CLE,2012-09-09,CLE,PHI,1,0,16,17
3,2012,1,2012_01_STL_DET,2012-09-09,DET,STL,1,0,27,23
4,2012,1,2012_01_MIA_HOU,2012-09-09,HOU,MIA,1,0,30,10


Quick sanity check to check blowout rates. Blowout rate is important because using the blowout rate, we can get a flag for blowouts during games. A blowout flag is necessary because score situation changes how much players are exposed to risk and workload in a game and additionally, change special teams volume

In [7]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  AVG(blowout_flag) AS blowout_rate
FROM team_game_weeks_results
""").df()

Unnamed: 0,rows,blowout_rate
0,7088,0.349605


We build a rest and scheduling table by adding prior game context, then computing days rest, short week, and bye last week flags for each team week

In [8]:
DB_DIR = Path("../db")
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

con.execute("""
CREATE OR REPLACE TABLE team_game_weeks_rest AS
WITH ordered AS (
  SELECT
    *,
    LAG(game_date) OVER (PARTITION BY season, team ORDER BY week) AS prev_game_date,
    LAG(week) OVER (PARTITION BY season, team ORDER BY week) AS prev_week
  FROM team_game_weeks_results
)
SELECT
  season,
  week,
  team,
  opponent,
  game_id,
  game_date,
  home_flag,
  away_flag,
  points_for,
  points_against,
  score_diff,
  win_flag,
  tie_flag,
  blowout_flag,
  DATE_DIFF('day', prev_game_date, game_date) AS days_rest,
  CASE WHEN DATE_DIFF('day', prev_game_date, game_date) <= 4 THEN 1 ELSE 0 END AS short_week_flag,
  CASE WHEN prev_week IS NOT NULL AND week - prev_week > 1 THEN 1 ELSE 0 END AS bye_last_week_flag
FROM ordered
""")

<_duckdb.DuckDBPyConnection at 0x113026bf0>

Quick sanity check to confirm the rest day calculations look realistic across the dataset

In [9]:
con.execute("""
SELECT
  MIN(days_rest) AS min_days_rest,
  MAX(days_rest) AS max_days_rest,
  AVG(days_rest) AS avg_days_rest
FROM team_game_weeks_rest
WHERE days_rest IS NOT NULL
""").df()

Unnamed: 0,min_days_rest,max_days_rest,avg_days_rest
0,4,17,7.505695


We create a per team week stats table from play by play. Using this, we can count offensive and defensive scrimmage snaps faced by a specific team as well as compute that specific team's offensive yards per play

In [10]:
DB_DIR = Path("../db")
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

con.execute("""
CREATE OR REPLACE TABLE team_week_basic_stats AS
WITH scrimmage AS (
  SELECT
    season,
    week,
    posteam AS team,
    yards_gained
  FROM pbp
  WHERE posteam IS NOT NULL
    AND play_type IN ('run','pass','sack','qb_kneel','qb_spike')
),
def_scrimmage AS (
  SELECT
    season,
    week,
    defteam AS team
  FROM pbp
  WHERE defteam IS NOT NULL
    AND play_type IN ('run','pass','sack','qb_kneel','qb_spike')
)
SELECT
  s.season,
  s.week,
  s.team,
  COUNT(*) AS offensive_snaps,
  SUM(COALESCE(s.yards_gained, 0)) * 1.0 / NULLIF(COUNT(*), 0) AS off_yards_per_play,
  (
    SELECT COUNT(*)
    FROM def_scrimmage d
    WHERE d.season = s.season AND d.week = s.week AND d.team = s.team
  ) AS defensive_snaps
FROM scrimmage s
GROUP BY 1,2,3
""")

<_duckdb.DuckDBPyConnection at 0x113032bb0>

Quick sanity check on the created stats table to confirm that the row count exists and the yards per play range is within a reasonable numeric range for testing

In [11]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  MIN(off_yards_per_play) AS min_ypp,
  MAX(off_yards_per_play) AS max_ypp
FROM team_week_basic_stats
""").df()

Unnamed: 0,rows,min_ypp,max_ypp
0,7088,1.119048,10.304348


We build a roster-based side mapping table. We assign each player week to either OFF, DEF, ST, or OTHER using roster position fields.

In [12]:
B_DIR = Path("../db")
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

con.execute("""
CREATE OR REPLACE TABLE roster_player_side AS
SELECT
  season,
  week,
  team,
  gsis_id,
  COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) AS pos,
  CASE
    WHEN COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) IN ('QB','RB','FB','WR','TE','C','G','T','OL') THEN 'OFF'
    WHEN COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) IN ('DL','DE','DT','NT','LB','ILB','OLB','CB','S','FS','SS') THEN 'DEF'
    WHEN COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) IN ('K','P','LS') THEN 'ST'
    ELSE 'OTHER'
  END AS side
FROM rosters_weekly
WHERE gsis_id IS NOT NULL
""")

<_duckdb.DuckDBPyConnection at 0x113038370>

Quick sanity check on the side mapping distribution by confirming that the roster rows are being classified

In [13]:
con.execute("""
SELECT side, COUNT(*) AS n
FROM roster_player_side
GROUP BY 1
ORDER BY n DESC
""").df()

Unnamed: 0,side,n
0,OFF,254630
1,OTHER,134044
2,DEF,131818
3,ST,26221


We create a unique injury presence dataset by removing duplicates across team-week-player cohorts, providing a foundation for our detection model

In [14]:
con.execute("""
CREATE OR REPLACE TABLE injuries_team_week_players AS
SELECT DISTINCT
  season,
  week,
  team,
  gsis_id
FROM injuries
WHERE team IS NOT NULL
  AND gsis_id IS NOT NULL
""")

<_duckdb.DuckDBPyConnection at 0x113038370>

We computed new injuries at the team-week level by subtracting players listed the previous week and then totaling new offensive and defensive injuries using roster side mapping

In [15]:
con.execute("""
CREATE OR REPLACE TABLE new_injuries_next_week AS
WITH curr AS (
  SELECT season, week, team, gsis_id
  FROM injuries_team_week_players
),
nxt AS (
  SELECT season, week AS week_next, team, gsis_id
  FROM injuries_team_week_players
),
new_only AS (
  SELECT
    nxt.season,
    nxt.week_next,
    nxt.team,
    nxt.gsis_id
  FROM nxt
  LEFT JOIN curr
    ON curr.season = nxt.season
   AND curr.week = nxt.week_next - 1
   AND curr.team = nxt.team
   AND curr.gsis_id = nxt.gsis_id
  WHERE curr.gsis_id IS NULL
)
SELECT
  new_only.season,
  new_only.week_next - 1 AS week,
  new_only.team,
  SUM(CASE WHEN r.side = 'OFF' THEN 1 ELSE 0 END) AS inj_off_next,
  SUM(CASE WHEN r.side = 'DEF' THEN 1 ELSE 0 END) AS inj_def_next
FROM new_only
LEFT JOIN roster_player_side r
  ON r.season = new_only.season
 AND r.week = new_only.week_next
 AND r.team = new_only.team
 AND r.gsis_id = new_only.gsis_id
GROUP BY 1,2,3
""")

<_duckdb.DuckDBPyConnection at 0x113038370>

Quick sanity checks on the new injury outcome table by confirming row coverage and checks that average counts are in a plausible range

In [16]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  AVG(inj_off_next) AS avg_off,
  AVG(inj_def_next) AS avg_def
FROM new_injuries_next_week
""").df()

Unnamed: 0,rows,avg_off,avg_def
0,6965,2.057717,1.0972


We produced the single canonical panel table used in every later workflow step, with schedule, rest, workload controls, and injury outcomes attached

In [17]:
DB_DIR = Path("../db")
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

con.execute("""
CREATE OR REPLACE TABLE team_week_panel AS
SELECT
  g.season,
  g.week,
  g.team,
  g.opponent,
  g.game_id,
  g.game_date,
  g.home_flag,
  g.away_flag,
  g.points_for,
  g.points_against,
  g.score_diff,
  g.win_flag,
  g.tie_flag,
  g.blowout_flag,
  g.days_rest,
  g.short_week_flag,
  g.bye_last_week_flag,
  COALESCE(b.offensive_snaps, 0) AS offensive_snaps,
  COALESCE(b.defensive_snaps, 0) AS defensive_snaps,
  b.off_yards_per_play,
  COALESCE(n.inj_off_next, 0) AS inj_off_next,
  COALESCE(n.inj_def_next, 0) AS inj_def_next
FROM team_game_weeks_rest g
LEFT JOIN team_week_basic_stats b
  ON b.season = g.season AND b.week = g.week AND b.team = g.team
LEFT JOIN new_injuries_next_week n
  ON n.season = g.season AND n.week = g.week AND n.team = g.team
""")

print("team_week_panel", con.execute("SELECT COUNT(*) FROM team_week_panel").fetchone()[0])

team_week_panel 7088


Quick sanity check to confirm that the final table is truly one row per team week and validates the output schema with a quick sample read

In [18]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT CAST(season AS VARCHAR) || '-' || CAST(week AS VARCHAR) || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()

Unnamed: 0,rows,distinct_keys
0,7088,7088


In [19]:
con.execute("""
SELECT *
FROM team_week_panel
ORDER BY season DESC, week DESC
LIMIT 5
""").df()

Unnamed: 0,season,week,team,opponent,game_id,game_date,home_flag,away_flag,points_for,points_against,...,tie_flag,blowout_flag,days_rest,short_week_flag,bye_last_week_flag,offensive_snaps,defensive_snaps,off_yards_per_play,inj_off_next,inj_def_next
0,2024,22,PHI,KC,2024_22_KC_PHI,2025-02-09,0,0,40,22,...,0,1,14,0,0,70,51,4.928571,0.0,0.0
1,2024,22,KC,PHI,2024_22_KC_PHI,2025-02-09,0,1,22,40,...,0,1,14,0,0,51,70,5.431373,0.0,0.0
2,2024,21,PHI,WAS,2024_21_WAS_PHI,2025-01-26,0,0,55,23,...,0,1,7,0,0,67,79,6.850746,4.0,1.0
3,2024,21,KC,BUF,2024_21_BUF_KC,2025-01-26,0,0,32,29,...,0,0,8,0,0,64,70,5.78125,1.0,0.0
4,2024,21,BUF,KC,2024_21_BUF_KC,2025-01-26,0,1,29,32,...,0,0,7,0,0,70,64,5.342857,0.0,0.0


Close the database

In [20]:
con.execute("CHECKPOINT")
con.close()

print("done")

done
