We import the Python dependencies and open a DuckDB connection that every later cell reuses. We also define small helper functions for schema checks and safe table overwrites, along with the key column names used throughout the notebook.

In [1]:
from pathlib import Path
import duckdb

# Directory that holds the DuckDB file; created on first run if absent.
DB_DIR = Path("..") / "db"
DB_DIR.mkdir(exist_ok=True, parents=True)

# Single shared connection used by the helpers and cells below.
con = duckdb.connect(str(DB_DIR.joinpath("nflpa.duckdb")))

# Key column names of the team-week panel.
SEASON_COL = "season"
WEEK_COL = "week"
TEAM_ABBR_COL = "team"

def _existing_cols(table_name: str) -> list[str]:
    """Return the column names that ``DESCRIBE`` reports for ``table_name``.

    The table name is interpolated into the SQL text as-is, so it must be a
    trusted identifier.
    """
    return con.execute(f"DESCRIBE {table_name}").df()["column_name"].tolist()

def _star_excluding(table_name: str, alias: str, exclude_cols: list[str]) -> str:
    """Build a ``SELECT`` column list for ``table_name`` without ``exclude_cols``.

    Args:
        table_name: Table whose columns are read through ``_existing_cols``.
        alias: Table alias prefixed onto every kept column.
        exclude_cols: Column names to leave out (exact, case-sensitive match).

    Returns:
        The kept columns, each rendered as a double-quoted identifier
        qualified by ``alias``, in the table's column order and separated by
        a comma followed by a newline and two spaces.
    """
    # Build the lookup set once rather than once per column.
    excluded = set(exclude_cols)
    kept = [col for col in _existing_cols(table_name) if col not in excluded]
    # Double any embedded double quote so the quoted identifier stays valid SQL.
    escaped = (col.replace('"', '""') for col in kept)
    return ",\n  ".join(f'{alias}."{name}"' for name in escaped)

def _first_present(cols: list[str], options: list[str]) -> str | None:
    s = set([c.lower() for c in cols])
    for opt in options:
        if opt.lower() in s:
            for c in cols:
                if c.lower() == opt.lower():
                    return c
    return None

Quick sanity check to confirm that the final dataframe is loaded correctly and contains the primary dependent variables needed for regression, which guards against running the model on an incomplete or outdated dataset.

In [2]:
# Columns the rest of this notebook needs on team_week_panel: the panel keys,
# the game id, both score columns, and the two Inj_*_NextGame_w columns.
required = [
    SEASON_COL,
    WEEK_COL,
    TEAM_ABBR_COL,
    "game_id",
    "points_for",
    "points_against",
    "Inj_Off_NextGame_w",
    "Inj_Def_NextGame_w",
]

cols_now = _existing_cols("team_week_panel")
missing = [c for c in required if c not in cols_now]

print("Missing required columns", missing)
if missing:
    # Fail loudly: a printed warning alone does not stop the remaining cells
    # from running against an incomplete panel.
    raise RuntimeError(
        f"STOP, rerun earlier notebooks before step 10, missing columns, {missing}"
    )
print("OK")

# Total rows versus distinct season-week-team keys; equal counts mean the
# panel key is unique.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()

Missing required columns []
OK


Unnamed: 0,rows,distinct_keys
0,6782,6782


We detect schedule-related columns and build schedule-based controls, creating a dedicated table with game dates, home/away status, rest intervals, and "short week" or "bye" flags using only raw schedule data and team-week ordering.

In [3]:
# Resolve the schedule table's actual column names (case-insensitively) from
# the accepted spellings of each logical field.
sched_cols = _existing_cols("schedules")

SCHED_GAME_ID = _first_present(sched_cols, ["game_id", "gsis_id"])
SCHED_SEASON = _first_present(sched_cols, ["season"])
SCHED_WEEK = _first_present(sched_cols, ["week"])
SCHED_HOME = _first_present(sched_cols, ["home_team", "home_team_abbr"])
SCHED_AWAY = _first_present(sched_cols, ["away_team", "away_team_abbr"])
# Score columns are detected but optional: nothing in this cell reads them.
SCHED_HOME_SCORE = _first_present(sched_cols, ["home_score", "home_points", "home_score_total"])
SCHED_AWAY_SCORE = _first_present(sched_cols, ["away_score", "away_points", "away_score_total"])
SCHED_DATE = _first_present(sched_cols, ["gameday", "game_date", "start_time", "game_datetime"])

# Map each required logical field to the column that was detected for it, so
# a failure can name exactly which fields could not be resolved.
need_sched_by_field = {
    "game_id": SCHED_GAME_ID,
    "season": SCHED_SEASON,
    "week": SCHED_WEEK,
    "home_team": SCHED_HOME,
    "away_team": SCHED_AWAY,
    "game_date": SCHED_DATE,
}
need_sched = list(need_sched_by_field.values())
missing_sched_fields = [field for field, col in need_sched_by_field.items() if col is None]
if missing_sched_fields:
    raise RuntimeError(
        "Schedules schema missing required columns, "
        f"missing, {missing_sched_fields}, detected, {need_sched_by_field}"
    )

# CREATE OR REPLACE swaps the table in one statement (the same idiom used
# when team_week_panel is rebuilt). Detected column names are double-quoted,
# as _star_excluding does, so unusual names still parse as identifiers.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_schedule_controls AS
WITH sched_base AS (
  SELECT
    CAST(s."{SCHED_SEASON}" AS INTEGER) AS season,
    CAST(s."{SCHED_WEEK}" AS INTEGER) AS week,
    CAST(s."{SCHED_GAME_ID}" AS VARCHAR) AS game_id,
    CAST(s."{SCHED_HOME}" AS VARCHAR) AS home_team,
    CAST(s."{SCHED_AWAY}" AS VARCHAR) AS away_team,
    TRY_CAST(s."{SCHED_DATE}" AS DATE) AS game_date
  FROM schedules s
),
-- Each schedule row appears twice: once for the home team, once for the away team.
team_rows AS (
  SELECT season, week, game_id, game_date, home_team AS team, away_team AS opponent, 1 AS home_flag_w
  FROM sched_base
  UNION ALL
  SELECT season, week, game_id, game_date, away_team AS team, home_team AS opponent, 0 AS home_flag_w
  FROM sched_base
),
-- Previous game (by week) of the same team within the same season.
with_prev AS (
  SELECT
    t.*,
    LAG(game_date) OVER (PARTITION BY season, team ORDER BY week) AS prev_game_date,
    LAG(week) OVER (PARTITION BY season, team ORDER BY week) AS prev_week
  FROM team_rows t
)
SELECT
  season,
  week,
  team,
  game_id,
  game_date,
  home_flag_w,
  -- NULL when there is no previous game in the season or a date is missing.
  CASE
    WHEN prev_game_date IS NULL OR game_date IS NULL THEN NULL
    ELSE date_diff('day', prev_game_date, game_date)
  END AS days_rest_w,
  -- 1 when the gap since the previous game is 4 days or fewer; 0 when unknown.
  CASE
    WHEN prev_game_date IS NULL OR game_date IS NULL THEN 0
    WHEN date_diff('day', prev_game_date, game_date) <= 4 THEN 1
    ELSE 0
  END AS short_week_flag_w,
  -- 1 when the team's previous game was more than one week number earlier.
  CASE
    WHEN prev_week IS NULL THEN 0
    WHEN week - prev_week > 1 THEN 1
    ELSE 0
  END AS bye_last_week_flag_w
FROM with_prev
""")

<_duckdb.DuckDBPyConnection at 0x1043728b0>

Quick sanity check to confirm that 'team_week_schedule_controls' has one row per season week team and no duplicates, which allows schedule controls to merge cleanly into 'team_week_panel'.

In [4]:
# Uniqueness check on team_week_schedule_controls: `rows` and `distinct_keys`
# are equal when every season-week-team combination appears exactly once.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_schedule_controls
""").df()

Unnamed: 0,rows,distinct_keys
0,7088,7088


Quick sanity check to confirm that 'days_rest_w' has sensible bounds and that the short-week and bye-last-week flags are being created at nonzero rates, which should catch schedule parsing errors early.

In [5]:
# Smallest and largest days_rest_w, plus how many rows carry the short-week
# flag and the bye-last-week flag, over the whole schedule controls table.
con.execute("""
SELECT
  MIN(days_rest_w) AS min_rest,
  MAX(days_rest_w) AS max_rest,
  SUM(short_week_flag_w) AS n_short_weeks,
  SUM(bye_last_week_flag_w) AS n_bye_last_week
FROM team_week_schedule_controls
""").df()

Unnamed: 0,min_rest,max_rest,n_short_weeks,n_bye_last_week
0,4,17,412.0,458.0


Quick sanity check to confirm that every 'team_week_panel' row successfully matches to schedule controls

In [6]:
# Left-join the panel to the schedule controls on (season, week, team) and
# count the panel rows for which the join found no schedule-controls row.
con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN s.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_schedule_controls
FROM team_week_panel p
LEFT JOIN team_week_schedule_controls s
  ON s.season = p.season
 AND s.week = p.week
 AND s.team = p.team
""").df()

Unnamed: 0,panel_rows,panel_rows_missing_schedule_controls
0,6782,0.0


Quick sanity check to count how many 'days_rest_w' values are null. These should mainly be the first game for each team in each season, which helps confirm that the lag logic is behaving as intended.

In [7]:
# Number of schedule-controls rows whose days_rest_w is NULL, next to the
# total row count for comparison.
con.execute("""
SELECT
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS n_null_days_rest,
  COUNT(*) AS rows_total
FROM team_week_schedule_controls
""").df()

Unnamed: 0,n_null_days_rest,rows_total
0,416.0,7088


We build play-derived controls for offensive snaps, defensive snaps, and offensive yards per play by aggregating play-by-play data to create the specific weekly metrics that account for game-level intensity and efficiency

In [8]:
# Resolve the play-by-play table's actual column names (case-insensitively)
# from the accepted spellings of each logical field.
pbp_cols = _existing_cols("pbp")

PBP_GAME_ID = _first_present(pbp_cols, ["game_id", "gsis_id"])
PBP_POSTEAM = _first_present(pbp_cols, ["posteam", "pos_team"])
PBP_DEFTEAM = _first_present(pbp_cols, ["defteam", "def_team"])
PBP_PLAY_TYPE = _first_present(pbp_cols, ["play_type", "play_type_nfl"])
PBP_YARDS = _first_present(pbp_cols, ["yards_gained", "yards", "ydsnet"])
# Optional: when absent, every play is treated as no_play = 0 below.
PBP_NO_PLAY = _first_present(pbp_cols, ["no_play", "noplay"])

# Map each required logical field to the column that was detected for it, so
# a failure can name exactly which fields could not be resolved.
need_pbp_by_field = {
    "game_id": PBP_GAME_ID,
    "posteam": PBP_POSTEAM,
    "defteam": PBP_DEFTEAM,
    "play_type": PBP_PLAY_TYPE,
    "yards_gained": PBP_YARDS,
}
need_pbp = list(need_pbp_by_field.values())
missing_pbp_fields = [field for field, col in need_pbp_by_field.items() if col is None]
if missing_pbp_fields:
    raise RuntimeError(
        "PBP schema missing required columns, "
        f"missing, {missing_pbp_fields}, detected, {need_pbp_by_field}"
    )

no_play_expr = "0"
if PBP_NO_PLAY is not None:
    no_play_expr = f'COALESCE(CAST("{PBP_NO_PLAY}" AS INTEGER), 0)'

# CREATE OR REPLACE swaps the table in one statement (the same idiom used
# when team_week_panel is rebuilt). Detected column names are double-quoted,
# as _star_excluding does, so unusual names still parse as identifiers.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_play_controls AS
WITH panel_keys AS (
  SELECT season, week, team, game_id
  FROM team_week_panel
),
scrimmage AS (
  SELECT
    CAST("{PBP_GAME_ID}" AS VARCHAR) AS game_id,
    CAST("{PBP_POSTEAM}" AS VARCHAR) AS posteam,
    CAST("{PBP_DEFTEAM}" AS VARCHAR) AS defteam,
    LOWER(CAST("{PBP_PLAY_TYPE}" AS VARCHAR)) AS play_type,
    CAST("{PBP_YARDS}" AS DOUBLE) AS yards_gained,
    {no_play_expr} AS no_play
  FROM pbp
  WHERE "{PBP_GAME_ID}" IS NOT NULL
    AND "{PBP_POSTEAM}" IS NOT NULL
    AND "{PBP_DEFTEAM}" IS NOT NULL
),
-- Keep only plays that count as snaps: not flagged no_play, and of a listed play type.
scrimmage_filtered AS (
  SELECT *
  FROM scrimmage
  WHERE no_play = 0
    AND play_type IN ('run','pass','qb_kneel','qb_spike')
),
-- Plays where the panel team had possession.
offense AS (
  SELECT
    k.season,
    k.week,
    k.team,
    COUNT(*) AS offensive_snaps_w,
    SUM(COALESCE(s.yards_gained, 0)) AS off_yards_w
  FROM panel_keys k
  JOIN scrimmage_filtered s
    ON s.game_id = k.game_id
   AND s.posteam = k.team
  GROUP BY 1,2,3
),
-- Plays where the panel team was the defending team.
defense AS (
  SELECT
    k.season,
    k.week,
    k.team,
    COUNT(*) AS defensive_snaps_w
  FROM panel_keys k
  JOIN scrimmage_filtered s
    ON s.game_id = k.game_id
   AND s.defteam = k.team
  GROUP BY 1,2,3
)
-- One output row per panel key; keys with no matching plays get 0 snaps and NULL yards per play.
SELECT
  k.season,
  k.week,
  k.team,
  COALESCE(o.offensive_snaps_w, 0) AS offensive_snaps_w,
  COALESCE(d.defensive_snaps_w, 0) AS defensive_snaps_w,
  CASE
    WHEN COALESCE(o.offensive_snaps_w, 0) > 0 THEN COALESCE(o.off_yards_w, 0) / o.offensive_snaps_w
    ELSE NULL
  END AS off_yards_per_play_w
FROM panel_keys k
LEFT JOIN offense o
  ON o.season = k.season
 AND o.week = k.week
 AND o.team = k.team
LEFT JOIN defense d
  ON d.season = k.season
 AND d.week = k.week
 AND d.team = k.team
""")

<_duckdb.DuckDBPyConnection at 0x1043728b0>

Quick sanity check to confirm that the rebuilt play controls now cover every row in the team week panel

In [9]:
# Left-join the panel to the play controls on (season, week, team) and count
# the panel rows for which the join found no play-controls row.
con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN pc.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_play_controls
FROM team_week_panel p
LEFT JOIN team_week_play_controls pc
  ON pc.season = p.season
 AND pc.week = p.week
 AND pc.team = p.team
""").df()

Unnamed: 0,panel_rows,panel_rows_missing_play_controls
0,6782,0.0


Quick sanity check to confirm that 'team_week_play_controls' has unique season week team keys so that it can be safely merged into 'team_week_panel' without duplicating rows

In [10]:
# Uniqueness check on team_week_play_controls: `rows` and `distinct_keys`
# are equal when every season-week-team combination appears exactly once.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_play_controls
""").df()

Unnamed: 0,rows,distinct_keys
0,6782,6782


Quick sanity check to confirm that offensive and defensive snap counts are populated and within plausible ranges

In [11]:
# Minimum and maximum of each play-derived control (offensive snaps,
# defensive snaps, offensive yards per play) across the play controls table.
con.execute("""
SELECT
  MIN(offensive_snaps_w) AS min_off_snaps,
  MAX(offensive_snaps_w) AS max_off_snaps,
  MIN(defensive_snaps_w) AS min_def_snaps,
  MAX(defensive_snaps_w) AS max_def_snaps,
  MIN(off_yards_per_play_w) AS min_off_ypp,
  MAX(off_yards_per_play_w) AS max_off_ypp
FROM team_week_play_controls
""").df()

Unnamed: 0,min_off_snaps,max_off_snaps,min_def_snaps,max_def_snaps,min_off_ypp,max_off_ypp
0,0,95,0,95,1.119048,10.304348


Quick sanity check to confirm that every 'team_week_panel' row matches to play controls.

In [12]:
# Same coverage query as cell In [9]: left-join the panel to the play
# controls on (season, week, team) and count panel rows with no match.
con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN pc.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_play_controls
FROM team_week_panel p
LEFT JOIN team_week_play_controls pc
  ON pc.season = p.season
 AND pc.week = p.week
 AND pc.team = p.team
""").df()

Unnamed: 0,panel_rows,panel_rows_missing_play_controls
0,6782,0.0


In [13]:
# Score-based controls computed from the panel's own score columns.
# CREATE OR REPLACE swaps the table in one statement, matching the idiom
# used for team_week_panel further down in this cell.
# Note: a row with a NULL score makes the blowout comparison NULL, so it
# falls through to ELSE and gets blowout_flag_w = 0.
con.execute("""
CREATE OR REPLACE TABLE team_week_score_controls AS
SELECT
  season,
  week,
  team,
  points_for,
  points_against,
  points_for - points_against AS score_diff_w,
  CASE WHEN ABS(points_for - points_against) >= 14 THEN 1 ELSE 0 END AS blowout_flag_w
FROM team_week_panel
""")

# Control columns this cell attaches to the panel. They are left out of the
# existing panel column list first, so re-running the cell does not produce
# duplicate column names.
exclude = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "off_yards_per_play_w",
    "score_diff_w",
    "blowout_flag_w",
    "days_rest_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
]

star = _star_excluding("team_week_panel", "p", exclude)

# Rebuild the panel in place: all pre-existing non-control columns, followed
# by the play, score, and schedule controls joined on (season, week, team).
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
SELECT
  {star},

  pc.offensive_snaps_w,
  pc.defensive_snaps_w,
  pc.off_yards_per_play_w,

  sc.score_diff_w,
  sc.blowout_flag_w,

  sch.days_rest_w,
  sch.short_week_flag_w,
  sch.bye_last_week_flag_w,
  sch.home_flag_w

FROM team_week_panel p
LEFT JOIN team_week_play_controls pc
  ON pc.season = p.season
 AND pc.week = p.week
 AND pc.team = p.team
LEFT JOIN team_week_score_controls sc
  ON sc.season = p.season
 AND sc.week = p.week
 AND sc.team = p.team
LEFT JOIN team_week_schedule_controls sch
  ON sch.season = p.season
 AND sch.week = p.week
 AND sch.team = p.team
""")

<_duckdb.DuckDBPyConnection at 0x1043728b0>