We initialize Python imports and opens a DuckDB connection that every later cell reuses

In [1]:
from pathlib import Path
import duckdb

DB_DIR = Path("../db")
DB_DIR.mkdir(parents=True, exist_ok=True)

con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

Quick sanity check to confirm that the required raw tables already exist in DuckDB before building derived tables.

In [2]:
required_tables = ["pbp", "schedules", "injuries", "snap_counts", "players", "rosters_weekly"]
existing = set(con.execute("SHOW TABLES").df()["name"].tolist())
missing = [t for t in required_tables if t not in existing]

print("Missing tables", missing)
print("OK" if not missing else "Fix step 2 before continuing")

Missing tables []
OK


We create the primary panel by generating one row per team for every game week and filtering the schedules to include only regular-season data

In [3]:
con.execute("""
CREATE OR REPLACE TABLE team_game_weeks AS
WITH base AS (
  SELECT
    season,
    week,
    game_id,
    TRY_CAST(gameday AS DATE) AS game_date,
    game_type,
    home_team,
    away_team,
    home_score,
    away_score,
    stadium_id,
    stadium,
    location,
    roof,
    surface,
    temp,
    wind,
    div_game
  FROM schedules
  WHERE game_type = 'REG'
)
SELECT
  season,
  week,
  game_id,
  game_date,
  game_type,
  home_team AS team,
  away_team AS opponent,
  1 AS home_flag,
  0 AS away_flag,
  home_score AS points_for,
  away_score AS points_against,
  stadium_id,
  stadium,
  location,
  roof,
  surface,
  temp,
  wind,
  div_game
FROM base

UNION ALL

SELECT
  season,
  week,
  game_id,
  game_date,
  game_type,
  away_team AS team,
  home_team AS opponent,
  0 AS home_flag,
  1 AS away_flag,
  away_score AS points_for,
  home_score AS points_against,
  stadium_id,
  stadium,
  location,
  roof,
  surface,
  temp,
  wind,
  div_game
FROM base
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm that the row count and week range in the primary panel align with the expected regular-season schedule across all seasons

In [4]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season) AS seasons,
  MIN(week) AS min_week,
  MAX(week) AS max_week
FROM team_game_weeks
""").df()

Unnamed: 0,rows,seasons,min_week,max_week
0,6782,13,1,18


We derive win-loss outcomes and game script metrics from the team-level game rows

In [5]:
con.execute("""
CREATE OR REPLACE TABLE team_game_weeks_results AS
SELECT
  season,
  week,
  team,
  opponent,
  game_id,
  game_date,
  game_type,
  home_flag,
  away_flag,
  points_for,
  points_against,
  points_for - points_against AS score_diff,
  CASE WHEN points_for > points_against THEN 1 ELSE 0 END AS win_flag,
  CASE WHEN points_for = points_against THEN 1 ELSE 0 END AS tie_flag,
  CASE WHEN ABS(points_for - points_against) >= 14 THEN 1 ELSE 0 END AS blowout_flag,
  stadium_id,
  stadium,
  location,
  roof,
  surface,
  temp,
  wind,
  div_game
FROM team_game_weeks
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm the results table is fully populated and that the blowout rate falls within a historically plausible range

In [6]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  AVG(blowout_flag) AS blowout_rate
FROM team_game_weeks_results
""").df()

Unnamed: 0,rows,blowout_rate
0,6782,0.34916


We compute rest days, short-week flags, and bye-week indicators by analyzing lagged game dates and week numbers

In [7]:
con.execute("""
CREATE OR REPLACE TABLE team_game_weeks_rest AS
WITH ordered AS (
  SELECT
    *,
    LAG(game_date) OVER (PARTITION BY season, team ORDER BY week) AS prev_game_date,
    LAG(week) OVER (PARTITION BY season, team ORDER BY week) AS prev_week
  FROM team_game_weeks_results
)
SELECT
  season,
  week,
  team,
  opponent,
  game_id,
  game_date,
  game_type,
  home_flag,
  away_flag,
  points_for,
  points_against,
  score_diff,
  win_flag,
  tie_flag,
  blowout_flag,
  stadium_id,
  stadium,
  location,
  roof,
  surface,
  temp,
  wind,
  div_game,
  DATE_DIFF('day', prev_game_date, game_date) AS days_rest,
  CASE WHEN prev_game_date IS NULL THEN NULL
       WHEN DATE_DIFF('day', prev_game_date, game_date) <= 4 THEN 1 ELSE 0 END AS short_week_flag,
  CASE WHEN prev_week IS NOT NULL AND week - prev_week > 1 THEN 1 ELSE 0 END AS bye_last_week_flag
FROM ordered
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm that rest day calculations are logically bounded with no negative values or impossible outliers

In [8]:
con.execute("""
SELECT
  MIN(days_rest) AS min_days_rest,
  MAX(days_rest) AS max_days_rest,
  AVG(days_rest) AS avg_days_rest
FROM team_game_weeks_rest
WHERE days_rest IS NOT NULL
""").df()

Unnamed: 0,min_days_rest,max_days_rest,avg_days_rest
0,4,17,7.4612


We build offensive and defensive snap counts, along with offensive yards per play, by aggregating raw play-by-play data

In [9]:
con.execute("""
CREATE OR REPLACE TABLE team_week_basic_stats AS
WITH scrimmage AS (
  SELECT
    season,
    week,
    posteam AS team,
    yards_gained
  FROM pbp
  WHERE posteam IS NOT NULL
    AND play_type IN ('run','pass','sack','qb_kneel','qb_spike')
),
def_scrimmage AS (
  SELECT
    season,
    week,
    defteam AS team
  FROM pbp
  WHERE defteam IS NOT NULL
    AND play_type IN ('run','pass','sack','qb_kneel','qb_spike')
)
SELECT
  s.season,
  s.week,
  s.team,
  COUNT(*) AS offensive_snaps,
  SUM(COALESCE(s.yards_gained, 0)) * 1.0 / NULLIF(COUNT(*), 0) AS off_yards_per_play,
  (
    SELECT COUNT(*)
    FROM def_scrimmage d
    WHERE d.season = s.season AND d.week = s.week AND d.team = s.team
  ) AS defensive_snaps
FROM scrimmage s
GROUP BY 1,2,3
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm that snap counts and yards per play were successfully computed and stay within realistic performance boundaries

In [10]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  MIN(off_yards_per_play) AS min_ypp,
  MAX(off_yards_per_play) AS max_ypp
FROM team_week_basic_stats
""").df()

Unnamed: 0,rows,min_ypp,max_ypp
0,7088,1.119048,10.304348


We map rostered players to their respective units, including offense, defense, or special teams

In [11]:
con.execute("""
CREATE OR REPLACE TABLE roster_player_side AS
SELECT
  season,
  week,
  team,
  gsis_id,
  COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) AS pos,
  CASE
    WHEN COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) IN ('QB','RB','FB','WR','TE','C','G','T','OL') THEN 'OFF'
    WHEN COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) IN ('DL','DE','DT','NT','LB','ILB','OLB','CB','S','FS','SS') THEN 'DEF'
    WHEN COALESCE(NULLIF(ngs_position, ''), NULLIF(position, '')) IN ('K','P','LS') THEN 'ST'
    ELSE 'OTHER'
  END AS side
FROM rosters_weekly
WHERE gsis_id IS NOT NULL
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm that the roster-to-side mapping produced a sensible distribution across offensive, defensive, and special teams units

In [12]:
con.execute("""
SELECT side, COUNT(*) AS n
FROM roster_player_side
GROUP BY 1
ORDER BY n DESC
""").df()

Unnamed: 0,side,n
0,OFF,254630
1,OTHER,134044
2,DEF,131818
3,ST,26221


We build a distinct set of injured players per team-week by aggregating entries from the primary injuries table

In [13]:
con.execute("""
CREATE OR REPLACE TABLE injuries_team_week_players AS
SELECT DISTINCT
  season,
  week,
  team,
  gsis_id
FROM injuries
WHERE team IS NOT NULL
  AND gsis_id IS NOT NULL
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

We compute new injuries for the following week and aggregate them into separate offensive and defensive counts

In [14]:
con.execute("""
CREATE OR REPLACE TABLE new_injuries_next_week AS
WITH curr AS (
  SELECT season, week, team, gsis_id
  FROM injuries_team_week_players
),
nxt AS (
  SELECT season, week AS week_next, team, gsis_id
  FROM injuries_team_week_players
),
new_only AS (
  SELECT
    nxt.season,
    nxt.week_next,
    nxt.team,
    nxt.gsis_id
  FROM nxt
  LEFT JOIN curr
    ON curr.season = nxt.season
   AND curr.week = nxt.week_next - 1
   AND curr.team = nxt.team
   AND curr.gsis_id = nxt.gsis_id
  WHERE curr.gsis_id IS NULL
)
SELECT
  new_only.season,
  new_only.week_next - 1 AS week,
  new_only.team,
  SUM(CASE WHEN r.side = 'OFF' THEN 1 ELSE 0 END) AS inj_off_next,
  SUM(CASE WHEN r.side = 'DEF' THEN 1 ELSE 0 END) AS inj_def_next,
  SUM(CASE WHEN r.side IS NULL THEN 1 ELSE 0 END) AS inj_side_missing_next
FROM new_only
LEFT JOIN roster_player_side r
  ON r.season = new_only.season
 AND r.week = new_only.week_next
 AND r.team = new_only.team
 AND r.gsis_id = new_only.gsis_id
GROUP BY 1,2,3
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm that next-week injury counts were successfully generated and that the distribution across roster sides is complete

In [15]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  AVG(inj_off_next) AS avg_off,
  AVG(inj_def_next) AS avg_def,
  AVG(inj_side_missing_next) AS avg_missing_side
FROM new_injuries_next_week
""").df()

Unnamed: 0,rows,avg_off,avg_def,avg_missing_side
0,6965,2.057717,1.0972,0.256281


We assemble the final analytic panel, merging situational rest factors and game-level results with play-by-play statistics and lead-week injury outcomes

In [16]:
con.execute("""
CREATE OR REPLACE TABLE team_week_panel AS
SELECT
  g.season,
  g.week,
  g.team,
  g.opponent,
  g.game_id,
  g.game_date,
  g.game_type,
  g.home_flag,
  g.away_flag,
  g.points_for,
  g.points_against,
  g.score_diff,
  g.win_flag,
  g.tie_flag,
  g.blowout_flag,
  g.stadium_id,
  g.stadium,
  g.location,
  g.roof,
  g.surface,
  g.temp,
  g.wind,
  g.div_game,
  g.days_rest,
  g.short_week_flag,
  g.bye_last_week_flag,
  b.offensive_snaps,
  b.defensive_snaps,
  b.off_yards_per_play,
  COALESCE(n.inj_off_next, 0) AS inj_off_next,
  COALESCE(n.inj_def_next, 0) AS inj_def_next,
  COALESCE(n.inj_side_missing_next, 0) AS inj_side_missing_next
FROM team_game_weeks_rest g
LEFT JOIN team_week_basic_stats b
  ON b.season = g.season AND b.week = g.week AND b.team = g.team
LEFT JOIN new_injuries_next_week n
  ON n.season = g.season AND n.week = g.week AND n.team = g.team
""")

<_duckdb.DuckDBPyConnection at 0x10627ef30>

Quick sanity check to confirm the final panel contains no duplicate team-week keys and to quantify any missing values introduced during the join process

In [17]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(*) - COUNT(DISTINCT season || '-' || week || '-' || team) AS duplicate_team_weeks,
  SUM(CASE WHEN offensive_snaps IS NULL THEN 1 ELSE 0 END) AS missing_off_snaps_rows,
  SUM(CASE WHEN defensive_snaps IS NULL THEN 1 ELSE 0 END) AS missing_def_snaps_rows
FROM team_week_panel
""").df()

Unnamed: 0,rows,duplicate_team_weeks,missing_off_snaps_rows,missing_def_snaps_rows
0,6782,0,272.0,272.0


Close the database

In [18]:
con.execute("CHECKPOINT")
con.close()