We initialize the Python imports and open a DuckDB connection that every later cell reuses. We also define small helper functions for schema checks and safe table overwrites, along with the key column names used throughout the notebook.

In [1]:
from pathlib import Path
import duckdb

# Directory that holds the DuckDB database file, relative to the working directory.
DB_DIR = Path("../db")
# Create the directory (and any missing parents); no error if it already exists.
DB_DIR.mkdir(parents=True, exist_ok=True)

# Shared connection: the helpers below and the later cells run their SQL through it.
con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

# Names of the team-week key columns.
TEAM_ABBR_COL = "team"
SEASON_COL = "season"
WEEK_COL = "week"

def _existing_cols(table_name: str) -> list[str]:
    """Return the column names of *table_name* as reported by ``DESCRIBE``.

    The table name is interpolated into the SQL text as-is, so it must be a
    trusted identifier. The query runs on the shared ``con`` connection.
    """
    described = con.execute(f"DESCRIBE {table_name}")
    return described.df()["column_name"].tolist()

def _star_excluding(table_name: str, alias: str, exclude_cols: list[str]) -> str:
    """Build a ``SELECT`` column list for *table_name* without *exclude_cols*.

    Every kept column is rendered as ``alias."column"`` and the entries are
    joined with a comma, a newline and a two-space indent so the result can be
    dropped straight into a multi-line query. Exclusion is an exact,
    case-sensitive match against the names returned by ``_existing_cols``.
    """
    # Build the lookup set once rather than once per column.
    excluded = set(exclude_cols)
    kept = [c for c in _existing_cols(table_name) if c not in excluded]
    # Double any embedded double quote so the quoted identifier stays valid SQL.
    quoted = [c.replace('"', '""') for c in kept]
    return ",\n  ".join(f'{alias}."{c}"' for c in quoted)

def _first_present(cols: list[str], options: list[str]) -> str | None:
    s = set([c.lower() for c in cols])
    for opt in options:
        if opt.lower() in s:
            for c in cols:
                if c.lower() == opt.lower():
                    return c
    return None

Quick sanity check to confirm that the final dataframe is loaded correctly and contains the primary dependent variables needed for regression, which prevents the model from running on an incomplete or outdated dataset.

In [2]:
# Columns that must already exist in team_week_panel before controls are added.
required = [
    SEASON_COL,
    WEEK_COL,
    TEAM_ABBR_COL,
    "game_id",
    "points_for",
    "points_against",
    "Inj_Off_NextGame_w",
    "Inj_Def_NextGame_w",
]

# Exact (case-sensitive) comparison against the names DESCRIBE reports.
cols_now = _existing_cols("team_week_panel")
missing = [c for c in required if c not in cols_now]

# Report only: the "STOP" message is printed but nothing is raised, so execution continues.
print("Missing required columns", missing)
print("OK" if not missing else "STOP, rerun earlier notebooks before step 10")

# Row count vs. distinct season-week-team keys; the two match when the key is unique.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()

Missing required columns []
OK


Unnamed: 0,rows,distinct_keys
0,6782,6782


We detect schedule-related columns and build schedule-based controls, creating a dedicated table with game dates, home/away status, rest intervals, and "short week" or "bye" flags using only raw schedule data and team-week ordering.

In [3]:
sched_cols = _existing_cols("schedules")

# Resolve the schedule column names, accepting a few alternative spellings.
SCHED_GAME_ID = _first_present(sched_cols, ["game_id", "gsis_id"])
SCHED_SEASON = _first_present(sched_cols, ["season"])
SCHED_WEEK = _first_present(sched_cols, ["week"])
SCHED_HOME = _first_present(sched_cols, ["home_team", "home_team_abbr"])
SCHED_AWAY = _first_present(sched_cols, ["away_team", "away_team_abbr"])
SCHED_DATE = _first_present(sched_cols, ["gameday", "game_date", "start_time", "game_datetime"])
SCHED_GAME_TYPE = _first_present(sched_cols, ["game_type", "season_type"])

# The game-type column is optional; every column in need_sched is mandatory.
need_sched = [SCHED_GAME_ID, SCHED_SEASON, SCHED_WEEK, SCHED_HOME, SCHED_AWAY, SCHED_DATE]
if any(x is None for x in need_sched):
    raise RuntimeError(f"Schedules schema missing required columns, detected, {need_sched}")

# Keep only regular-season games when the schedule exposes a game-type column.
game_type_filter = ""
if SCHED_GAME_TYPE is not None:
    game_type_filter = f"WHERE LOWER(CAST(s.{SCHED_GAME_TYPE} AS VARCHAR)) IN ('reg','regular')"

con.execute("DROP TABLE IF EXISTS team_week_schedule_controls")

# Each scheduled game is emitted twice (home row and away row), then the
# previous game of the same team within the season is looked up with LAG.
# A team's first game is detected through prev_week IS NULL, the same test
# bye_last_week_flag_w uses. Keying on prev_game_date would also flag later
# games whose previous date could not be parsed and would hide them behind the
# imputed 7 days instead of leaving days_rest_w NULL.
con.execute(f"""
CREATE TABLE team_week_schedule_controls AS
WITH sched_base AS (
  SELECT
    CAST(s.{SCHED_SEASON} AS INTEGER) AS season,
    CAST(s.{SCHED_WEEK} AS INTEGER) AS week,
    CAST(s.{SCHED_GAME_ID} AS VARCHAR) AS game_id,
    CAST(s.{SCHED_HOME} AS VARCHAR) AS home_team,
    CAST(s.{SCHED_AWAY} AS VARCHAR) AS away_team,
    COALESCE(
      TRY_CAST(s.{SCHED_DATE} AS DATE),
      CAST(TRY_CAST(s.{SCHED_DATE} AS TIMESTAMP) AS DATE)
    ) AS game_date
  FROM schedules s
  {game_type_filter}
),
team_rows AS (
  SELECT season, week, game_id, game_date, home_team AS team, away_team AS opponent, 1 AS home_flag_w
  FROM sched_base
  UNION ALL
  SELECT season, week, game_id, game_date, away_team AS team, home_team AS opponent, 0 AS home_flag_w
  FROM sched_base
),
with_prev AS (
  SELECT
    t.*,
    LAG(game_date) OVER (PARTITION BY season, team ORDER BY week) AS prev_game_date,
    LAG(week) OVER (PARTITION BY season, team ORDER BY week) AS prev_week
  FROM team_rows t
)
SELECT
  season,
  week,
  team,
  game_id,
  game_date,
  home_flag_w,
  CASE WHEN prev_week IS NULL THEN 1 ELSE 0 END AS first_game_flag_w,
  CASE
    -- first game of the season for this team: impute a standard 7-day week
    WHEN prev_week IS NULL THEN 7
    -- a date on either side could not be parsed: leave the gap unknown
    WHEN prev_game_date IS NULL OR game_date IS NULL THEN NULL
    ELSE date_diff('day', prev_game_date, game_date)
  END AS days_rest_w,
  CASE
    WHEN prev_game_date IS NULL OR game_date IS NULL THEN 0
    WHEN date_diff('day', prev_game_date, game_date) <= 4 THEN 1
    ELSE 0
  END AS short_week_flag_w,
  CASE
    WHEN prev_week IS NULL THEN 0
    WHEN week - prev_week > 1 THEN 1
    ELSE 0
  END AS bye_last_week_flag_w
FROM with_prev
""")

<_duckdb.DuckDBPyConnection at 0x10725b7b0>

Quick sanity check to confirm that 'team_week_schedule_controls' has one row per season week team and no duplicates, which allows schedule controls to merge cleanly into 'team_week_panel'.

In [4]:
# Row count vs. distinct season-week-team keys; equal values mean no duplicate keys.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_schedule_controls
""").df()

Unnamed: 0,rows,distinct_keys
0,6782,6782


Quick sanity check to confirm that 'days_rest_w' has sensible bounds and that the short-week and bye-last-week flags are being created at nonzero rates, which should catch schedule parsing errors early.

In [5]:
# Range of days_rest_w plus how many rows carry the short-week and bye-last-week flags.
con.execute("""
SELECT
  MIN(days_rest_w) AS min_rest,
  MAX(days_rest_w) AS max_rest,
  SUM(short_week_flag_w) AS n_short_weeks,
  SUM(bye_last_week_flag_w) AS n_bye_last_week
FROM team_week_schedule_controls
""").df()

Unnamed: 0,min_rest,max_rest,n_short_weeks,n_bye_last_week
0,4,17,412.0,416.0


Quick sanity check to confirm that every 'team_week_panel' row successfully matches to schedule controls

In [6]:
# Left join from the panel: a NULL s.team marks a panel row without a schedule-controls match.
con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN s.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_schedule_controls
FROM team_week_panel p
LEFT JOIN team_week_schedule_controls s
  ON s.season = p.season
 AND s.week = p.week
 AND s.team = p.team
""").df()

Unnamed: 0,panel_rows,panel_rows_missing_schedule_controls
0,6782,0.0


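Quick sanity check to confirm how many 'days_rest_w' values are null. Because the first game for each team in each season is imputed to 7 days of rest (and marked by 'first_game_flag_w'), the count should be zero; any nulls would point to unparsed game dates and help confirm whether the lag logic is behaving as intended.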

In [7]:
# Number of NULL days_rest_w values next to the total row count.
con.execute("""
SELECT
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS n_null_days_rest,
  COUNT(*) AS rows_total
FROM team_week_schedule_controls
""").df()

Unnamed: 0,n_null_days_rest,rows_total
0,0.0,6782


We build play-derived controls for offensive snaps, defensive snaps, and offensive yards per play by aggregating play-by-play data to create the specific weekly metrics that account for game-level intensity and efficiency

In [8]:
pbp_cols = _existing_cols("pbp")

# Resolve the play-by-play column names, accepting a few alternative spellings.
PBP_GAME_ID = _first_present(pbp_cols, ["game_id", "gsis_id"])
PBP_POSTEAM = _first_present(pbp_cols, ["posteam", "pos_team"])
PBP_DEFTEAM = _first_present(pbp_cols, ["defteam", "def_team"])
PBP_PLAY_TYPE = _first_present(pbp_cols, ["play_type", "play_type_nfl"])
PBP_YARDS = _first_present(pbp_cols, ["yards_gained", "yards", "ydsnet"])
PBP_NO_PLAY = _first_present(pbp_cols, ["no_play", "noplay"])

# The no_play column is optional; every column in need_pbp is mandatory.
need_pbp = [PBP_GAME_ID, PBP_POSTEAM, PBP_DEFTEAM, PBP_PLAY_TYPE, PBP_YARDS]
if any(x is None for x in need_pbp):
    raise RuntimeError(f"PBP schema missing required columns, detected, {need_pbp}")

# Without a no_play column every play is treated as a real play (no_play = 0).
no_play_expr = "0"
if PBP_NO_PLAY is not None:
    no_play_expr = f"COALESCE(CAST({PBP_NO_PLAY} AS INTEGER), 0)"

con.execute("DROP TABLE IF EXISTS team_week_play_controls")

# Query outline:
#   team_code_map      season-bounded translation of pbp team codes to panel codes
#   scrimmage          plays with non-null game id, posteam and defteam; the season
#                      is taken from the first four characters of the game id
#   scrimmage_mapped   team codes translated, falling back to the raw code when unmapped
#   scrimmage_filtered run / pass / sack / qb_kneel / qb_spike plays only
#   offense, defense   per-panel-key aggregates matched on game_id and posteam / defteam
# Yards per play uses run, pass and sack plays with no_play = 0 and is NULL when
# a team-week has none; the snap counts default to 0 when nothing matched.
con.execute(f"""
CREATE TABLE team_week_play_controls AS
WITH team_code_map AS (
  SELECT 'LAC' AS pbp_code, 'SD' AS panel_code, 0 AS start_season, 2016 AS end_season
  UNION ALL SELECT 'LV', 'OAK', 0, 2019
  UNION ALL SELECT 'LAR', 'STL', 0, 2015
  UNION ALL SELECT 'LA', 'STL', 0, 2015
  UNION ALL SELECT 'LAR', 'LA', 2016, 9999
  UNION ALL SELECT 'JAC', 'JAX', 0, 9999
  UNION ALL SELECT 'WSH', 'WAS', 0, 9999
  UNION ALL SELECT 'TAM', 'TB', 0, 9999
  UNION ALL SELECT 'NWE', 'NE', 0, 9999
  UNION ALL SELECT 'GNB', 'GB', 0, 9999
  UNION ALL SELECT 'SFO', 'SF', 0, 9999
  UNION ALL SELECT 'KAN', 'KC', 0, 9999
  UNION ALL SELECT 'NOR', 'NO', 0, 9999
),
panel_keys AS (
  SELECT season, week, team, game_id
  FROM team_week_panel
),
scrimmage AS (
  SELECT
    CAST({PBP_GAME_ID} AS VARCHAR) AS game_id,
    CAST({PBP_POSTEAM} AS VARCHAR) AS posteam_raw,
    CAST({PBP_DEFTEAM} AS VARCHAR) AS defteam_raw,
    LOWER(CAST({PBP_PLAY_TYPE} AS VARCHAR)) AS play_type,
    CAST({PBP_YARDS} AS DOUBLE) AS yards_gained,
    {no_play_expr} AS no_play,
    TRY_CAST(SUBSTR(CAST({PBP_GAME_ID} AS VARCHAR), 1, 4) AS INTEGER) AS season_from_game_id
  FROM pbp
  WHERE {PBP_GAME_ID} IS NOT NULL
    AND {PBP_POSTEAM} IS NOT NULL
    AND {PBP_DEFTEAM} IS NOT NULL
),
scrimmage_mapped AS (
  SELECT
    s.game_id,
    s.play_type,
    s.yards_gained,
    s.no_play,
    COALESCE(m1.panel_code, s.posteam_raw) AS posteam,
    COALESCE(m2.panel_code, s.defteam_raw) AS defteam
  FROM scrimmage s
  LEFT JOIN team_code_map m1
    ON m1.pbp_code = s.posteam_raw
   AND s.season_from_game_id BETWEEN m1.start_season AND m1.end_season
  LEFT JOIN team_code_map m2
    ON m2.pbp_code = s.defteam_raw
   AND s.season_from_game_id BETWEEN m2.start_season AND m2.end_season
),
scrimmage_filtered AS (
  SELECT *
  FROM scrimmage_mapped
  WHERE play_type IN ('run','pass','sack','qb_kneel','qb_spike')
),
offense AS (
  SELECT
    k.season,
    k.week,
    k.team,
    SUM(CASE WHEN s.no_play = 0 THEN 1 ELSE 0 END) AS offensive_snaps_w,
    SUM(CASE WHEN s.no_play = 1 THEN 1 ELSE 0 END) AS offensive_no_play_snaps_w,
    SUM(CASE WHEN s.no_play = 0 AND s.play_type IN ('run','pass','sack') THEN COALESCE(s.yards_gained, 0) ELSE 0 END) AS off_yards_eff_w,
    SUM(CASE WHEN s.no_play = 0 AND s.play_type IN ('run','pass','sack') THEN 1 ELSE 0 END) AS off_snaps_eff_w
  FROM panel_keys k
  JOIN scrimmage_filtered s
    ON s.game_id = k.game_id
   AND s.posteam = k.team
  GROUP BY 1,2,3
),
defense AS (
  SELECT
    k.season,
    k.week,
    k.team,
    SUM(CASE WHEN s.no_play = 0 THEN 1 ELSE 0 END) AS defensive_snaps_w,
    SUM(CASE WHEN s.no_play = 1 THEN 1 ELSE 0 END) AS defensive_no_play_snaps_w
  FROM panel_keys k
  JOIN scrimmage_filtered s
    ON s.game_id = k.game_id
   AND s.defteam = k.team
  GROUP BY 1,2,3
)
SELECT
  k.season,
  k.week,
  k.team,
  COALESCE(o.offensive_snaps_w, 0) AS offensive_snaps_w,
  COALESCE(d.defensive_snaps_w, 0) AS defensive_snaps_w,
  COALESCE(o.offensive_no_play_snaps_w, 0) AS offensive_no_play_snaps_w,
  COALESCE(d.defensive_no_play_snaps_w, 0) AS defensive_no_play_snaps_w,
  CASE
    WHEN COALESCE(o.off_snaps_eff_w, 0) > 0 THEN COALESCE(o.off_yards_eff_w, 0) / o.off_snaps_eff_w
    ELSE NULL
  END AS off_yards_per_play_w
FROM panel_keys k
LEFT JOIN offense o
  ON o.season = k.season
 AND o.week = k.week
 AND o.team = k.team
LEFT JOIN defense d
  ON d.season = k.season
 AND d.week = k.week
 AND d.team = k.team
""")

<_duckdb.DuckDBPyConnection at 0x10725b7b0>

Quick sanity check to confirm that schedule controls keys are unique, panel join coverage is complete, and rest day fields look reasonable

In [9]:
# Only the last expression of a cell is displayed, so the earlier query results
# are printed explicitly; otherwise they would be computed and thrown away.

# Key uniqueness: row count vs. distinct season-week-team keys.
print(con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_schedule_controls
""").df())

# Rest-day bounds together with the first-game, short-week and bye counts.
print(con.execute("""
SELECT
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS n_null_days_rest,
  SUM(CASE WHEN first_game_flag_w = 1 THEN 1 ELSE 0 END) AS n_first_games,
  MIN(days_rest_w) AS min_rest,
  MAX(days_rest_w) AS max_rest,
  SUM(short_week_flag_w) AS n_short_weeks,
  SUM(bye_last_week_flag_w) AS n_bye_last_week
FROM team_week_schedule_controls
""").df())

# Join coverage: panel rows that find no schedule-controls match.
print(con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN s.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_schedule_controls
FROM team_week_panel p
LEFT JOIN team_week_schedule_controls s
  ON s.season = p.season
 AND s.week = p.week
 AND s.team = p.team
""").df())

# NULL rest days and first-game count against the total (shown as the cell result).
con.execute("""
SELECT
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS n_null_days_rest,
  SUM(first_game_flag_w) AS n_first_games,
  COUNT(*) AS rows_total
FROM team_week_schedule_controls
""").df()

Unnamed: 0,n_null_days_rest,n_first_games,rows_total
0,0.0,416.0,6782


Quick sanity check to confirm that 'team_week_play_controls' has unique season week team keys so that it can be safely merged into 'team_week_panel' without duplicating rows

In [10]:
# Row count vs. distinct season-week-team keys; equal values mean no duplicate keys.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_play_controls
""").df()

Unnamed: 0,rows,distinct_keys
0,6782,6782


Quick sanity check to confirm that offensive and defensive snap counts are populated and within plausible ranges

In [11]:
# Minimum and maximum of the snap counts and of offensive yards per play.
con.execute("""
SELECT
  MIN(offensive_snaps_w) AS min_off_snaps,
  MAX(offensive_snaps_w) AS max_off_snaps,
  MIN(defensive_snaps_w) AS min_def_snaps,
  MAX(defensive_snaps_w) AS max_def_snaps,
  MIN(off_yards_per_play_w) AS min_off_ypp,
  MAX(off_yards_per_play_w) AS max_off_ypp
FROM team_week_play_controls
""").df()

Unnamed: 0,min_off_snaps,max_off_snaps,min_def_snaps,max_def_snaps,min_off_ypp,max_off_ypp
0,33.0,95.0,33.0,95.0,1.119048,10.882353


Quick sanity check to confirm that every 'team_week_panel' row matches to play controls

In [12]:
# Left join from the panel: a NULL pc.team marks a panel row without a play-controls match.
con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN pc.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_play_controls
FROM team_week_panel p
LEFT JOIN team_week_play_controls pc
  ON pc.season = p.season
 AND pc.week = p.week
 AND pc.team = p.team
""").df()

Unnamed: 0,panel_rows,panel_rows_missing_play_controls
0,6782,0.0


Quick sanity check to confirm that the team-code mapping fix actually removed the zero-snap rows

In [13]:
# Team-weeks with zero offensive snaps or a NULL yards-per-play value in the controls table.
con.execute("""
SELECT
  SUM(CASE WHEN offensive_snaps_w = 0 THEN 1 ELSE 0 END) AS n_zero_off_snaps,
  SUM(CASE WHEN off_yards_per_play_w IS NULL THEN 1 ELSE 0 END) AS n_null_off_ypp
FROM team_week_play_controls
""").df()

Unnamed: 0,n_zero_off_snaps,n_null_off_ypp
0,0.0,0.0


Quick sanity check to confirm that the panel now has zero null off yards per play unless a game truly has no valid scrimmage plays, which should be essentially none

In [14]:
# Same counts on the panel's own offensive_snaps / off_yards_per_play columns
# (no _w suffix), not on the *_w columns of team_week_play_controls.
con.execute("""
SELECT
  SUM(CASE WHEN offensive_snaps = 0 THEN 1 ELSE 0 END) AS n_zero_off_snaps,
  SUM(CASE WHEN off_yards_per_play IS NULL THEN 1 ELSE 0 END) AS n_null_off_ypp
FROM team_week_panel
""").df()

Unnamed: 0,n_zero_off_snaps,n_null_off_ypp
0,0.0,272.0


We calculate game-script indicators from final scores and combine them with our existing schedule and play-by-play metrics to form a comprehensive suite of control variables for each team-week

In [15]:
con.execute("DROP TABLE IF EXISTS team_week_score_controls")

# Score controls: signed point margin and a flag for margins of 14 or more in either direction.
con.execute("""
CREATE TABLE team_week_score_controls AS
SELECT
  season,
  week,
  team,
  points_for,
  points_against,
  points_for - points_against AS score_diff_w,
  CASE WHEN ABS(points_for - points_against) >= 14 THEN 1 ELSE 0 END AS blowout_flag_w
FROM team_week_panel
""")

# Control columns added by the merge below. They are removed from the panel's
# own column list first, so a rerun does not select the same name twice.
exclude = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "off_yards_per_play_w",
    "offensive_no_play_snaps_w",
    "defensive_no_play_snaps_w",
    "score_diff_w",
    "blowout_flag_w",
    "days_rest_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "first_game_flag_w",
]

# Every remaining panel column, rendered as p."col".
star = _star_excluding("team_week_panel", "p", exclude)

# Rebuild the panel in place, left-joining the three control tables on season, week and team.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
SELECT
  {star},

  pc.offensive_snaps_w,
  pc.defensive_snaps_w,
  pc.off_yards_per_play_w,
  pc.offensive_no_play_snaps_w,
  pc.defensive_no_play_snaps_w,

  sc.score_diff_w,
  sc.blowout_flag_w,

  sch.days_rest_w,
  sch.short_week_flag_w,
  sch.bye_last_week_flag_w,
  sch.home_flag_w,
  sch.first_game_flag_w

FROM team_week_panel p
LEFT JOIN team_week_play_controls pc
  ON pc.season = p.season
 AND pc.week = p.week
 AND pc.team = p.team
LEFT JOIN team_week_score_controls sc
  ON sc.season = p.season
 AND sc.week = p.week
 AND sc.team = p.team
LEFT JOIN team_week_schedule_controls sch
  ON sch.season = p.season
 AND sch.week = p.week
 AND sch.team = p.team
""")

<_duckdb.DuckDBPyConnection at 0x10725b7b0>

Quick sanity check to confirm that the merged 'team_week_panel' contains no rows with zero offensive snaps, looking at the overall count, the count by team, and the count by season and team

In [16]:
# Only the last expression of a cell is displayed, so the earlier query results
# are printed explicitly; otherwise they would be computed and thrown away.

# Overall count of zero-offensive-snap rows and the seasons they span.
print(con.execute("""
SELECT
  COUNT(*) AS rows_zero_off_snaps,
  MIN(season) AS min_season,
  MAX(season) AS max_season
FROM team_week_panel
WHERE offensive_snaps_w = 0
""").df())

# The same rows counted per team.
print(con.execute("""
SELECT
  team,
  COUNT(*) AS n
FROM team_week_panel
WHERE offensive_snaps_w = 0
GROUP BY 1
ORDER BY n DESC
""").df())

# The same rows counted per season and team (shown as the cell result).
con.execute("""
SELECT
  season,
  team,
  COUNT(*) AS n
FROM team_week_panel
WHERE offensive_snaps_w = 0
GROUP BY 1,2
ORDER BY 1,3 DESC
LIMIT 60
""").df()

Unnamed: 0,season,team,n


Quick sanity check for unexpected nulls in the newly merged control columns, flagging any failed joins or variables that are not being populated as expected

In [17]:
# NULL counts for a selection of the merged control columns.
con.execute("""
SELECT
  SUM(CASE WHEN offensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_off_snaps,
  SUM(CASE WHEN defensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_def_snaps,
  SUM(CASE WHEN off_yards_per_play_w IS NULL THEN 1 ELSE 0 END) AS null_off_ypp,
  SUM(CASE WHEN score_diff_w IS NULL THEN 1 ELSE 0 END) AS null_score_diff,
  SUM(CASE WHEN blowout_flag_w IS NULL THEN 1 ELSE 0 END) AS null_blowout,
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS null_days_rest,
  SUM(CASE WHEN home_flag_w IS NULL THEN 1 ELSE 0 END) AS null_home
FROM team_week_panel
""").df()

Unnamed: 0,null_off_snaps,null_def_snaps,null_off_ypp,null_score_diff,null_blowout,null_days_rest,null_home
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Quick sanity check to identify exactly which teams and seasons still have zero offensive snaps so we know what mapping to add next

In [18]:
# Only the last expression of a cell is displayed, so the first query result
# is printed explicitly; otherwise it would be computed and thrown away.

# Zero-offensive-snap team-weeks counted per season and team.
print(con.execute("""
SELECT
  season,
  team,
  COUNT(*) AS n
FROM team_week_play_controls
WHERE offensive_snaps_w = 0
GROUP BY 1,2
ORDER BY 1,3 DESC
""").df())

# The same rows counted per team (shown as the cell result).
con.execute("""
SELECT
  team,
  COUNT(*) AS n
FROM team_week_play_controls
WHERE offensive_snaps_w = 0
GROUP BY 1
ORDER BY n DESC
""").df()

Unnamed: 0,team,n


Quick sanity check to confirm that the injury control columns are present for the next-game outcome model by verifying that 'Inj_Off_LastGame_w' and 'Inj_Def_LastGame_w' exist, while leaving 'Inj_Off_Last_w' and 'Inj_Def_Last_w' intact for the 'w+1' outcome

In [19]:
# Refresh the column list; the panel table may have been rebuilt since the last check.
cols_now = _existing_cols("team_week_panel")

# Exact (case-sensitive) presence check for the two lagged injury controls.
need_injury_controls = ["Inj_Off_LastGame_w", "Inj_Def_LastGame_w"]
missing = [c for c in need_injury_controls if c not in cols_now]

# Report only: nothing is raised here when a column is missing.
print("Missing next game injury lag controls", missing)
print("OK" if not missing else "STOP, rerun the next game injury outcome block in notebook 09")

# NULL counts for both columns; the query references them by name.
con.execute("""
SELECT
  SUM(CASE WHEN Inj_Off_LastGame_w IS NULL THEN 1 ELSE 0 END) AS null_inj_off_last_game,
  SUM(CASE WHEN Inj_Def_LastGame_w IS NULL THEN 1 ELSE 0 END) AS null_inj_def_last_game
FROM team_week_panel
""").df()

Missing next game injury lag controls []
OK


Unnamed: 0,null_inj_off_last_game,null_inj_def_last_game
0,0.0,0.0


Quick sanity check to confirm the final integrity of the dataset before it enters the regression phase to ensure that the team-week keys are unique and that the essential control variables are fully populated for the upcoming statistical analysis

In [20]:
# Only the last expression of a cell is displayed, so the earlier query results
# are printed explicitly; otherwise they would be computed and thrown away.

# Key uniqueness: row count vs. distinct season-week-team keys.
print(con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df())

# NULL counts for the essential control columns.
print(con.execute("""
SELECT
  SUM(CASE WHEN offensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_off_snaps,
  SUM(CASE WHEN defensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_def_snaps,
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS null_days_rest,
  SUM(CASE WHEN home_flag_w IS NULL THEN 1 ELSE 0 END) AS null_home_flag,
  SUM(CASE WHEN score_diff_w IS NULL THEN 1 ELSE 0 END) AS null_score_diff,
  SUM(CASE WHEN blowout_flag_w IS NULL THEN 1 ELSE 0 END) AS null_blowout
FROM team_week_panel
""").df())

# Model sample size: rows whose offensive next-game outcome is not NULL
# (shown as the cell result).
con.execute("""
SELECT
  COUNT(*) AS rows_total,
  SUM(CASE WHEN Inj_Off_NextGame_w IS NULL THEN 1 ELSE 0 END) AS null_off_next_game,
  SUM(CASE WHEN Inj_Def_NextGame_w IS NULL THEN 1 ELSE 0 END) AS null_def_next_game,
  COUNT(*) - SUM(CASE WHEN Inj_Off_NextGame_w IS NULL THEN 1 ELSE 0 END) AS rows_model_sample
FROM team_week_panel
""").df()

Unnamed: 0,rows_total,null_off_next_game,null_def_next_game,rows_model_sample
0,6782,416.0,416.0,6366.0


Quick sanity check to confirm that play controls have one row per team week, fully cover the panel keys, and have no unexpected zeros or nulls

In [21]:
# Only the last expression of a cell is displayed, so the earlier query results
# are printed explicitly; otherwise they would be computed and thrown away.

# Key uniqueness: row count vs. distinct season-week-team keys.
print(con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_play_controls
""").df())

# Join coverage: panel rows that find no play-controls match.
print(con.execute("""
SELECT
  COUNT(*) AS panel_rows,
  SUM(CASE WHEN pc.team IS NULL THEN 1 ELSE 0 END) AS panel_rows_missing_play_controls
FROM team_week_panel p
LEFT JOIN team_week_play_controls pc
  ON pc.season = p.season
 AND pc.week = p.week
 AND pc.team = p.team
""").df())

# Zero or NULL values, and how many team-weeks have any no-play snaps.
print(con.execute("""
SELECT
  SUM(CASE WHEN offensive_snaps_w = 0 THEN 1 ELSE 0 END) AS n_zero_off_snaps,
  SUM(CASE WHEN defensive_snaps_w = 0 THEN 1 ELSE 0 END) AS n_zero_def_snaps,
  SUM(CASE WHEN off_yards_per_play_w IS NULL THEN 1 ELSE 0 END) AS n_null_off_ypp,
  SUM(CASE WHEN offensive_no_play_snaps_w > 0 THEN 1 ELSE 0 END) AS n_weeks_with_off_no_play,
  SUM(CASE WHEN defensive_no_play_snaps_w > 0 THEN 1 ELSE 0 END) AS n_weeks_with_def_no_play
FROM team_week_play_controls
""").df())

# How many pbp rows yield no integer season from the first four characters of
# the game id (shown as the cell result).
con.execute(f"""
SELECT
  COUNT(*) AS rows,
  SUM(CASE WHEN season_from_game_id IS NULL THEN 1 ELSE 0 END) AS n_null_season_from_game_id
FROM (
  SELECT
    TRY_CAST(SUBSTR(CAST({PBP_GAME_ID} AS VARCHAR), 1, 4) AS INTEGER) AS season_from_game_id
  FROM pbp
  WHERE {PBP_GAME_ID} IS NOT NULL
)
""").df()

Unnamed: 0,rows,n_null_season_from_game_id
0,627226,0.0


Quick sanity check to confirm that panel keys stayed unique and all control columns are present with expected null patterns

In [22]:
# Only the last expression of a cell is displayed, so the first query result
# is printed explicitly; otherwise it would be computed and thrown away.

# Key uniqueness: row count vs. distinct season-week-team keys.
print(con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df())

# NULL counts for every merged control column (shown as the cell result).
con.execute("""
SELECT
  SUM(CASE WHEN offensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_off_snaps,
  SUM(CASE WHEN defensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_def_snaps,
  SUM(CASE WHEN off_yards_per_play_w IS NULL THEN 1 ELSE 0 END) AS null_off_ypp,
  SUM(CASE WHEN offensive_no_play_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_off_no_play,
  SUM(CASE WHEN defensive_no_play_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_def_no_play,
  SUM(CASE WHEN score_diff_w IS NULL THEN 1 ELSE 0 END) AS null_score_diff,
  SUM(CASE WHEN blowout_flag_w IS NULL THEN 1 ELSE 0 END) AS null_blowout,
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS null_days_rest,
  SUM(CASE WHEN short_week_flag_w IS NULL THEN 1 ELSE 0 END) AS null_short_week,
  SUM(CASE WHEN bye_last_week_flag_w IS NULL THEN 1 ELSE 0 END) AS null_bye_last_week,
  SUM(CASE WHEN home_flag_w IS NULL THEN 1 ELSE 0 END) AS null_home,
  SUM(CASE WHEN first_game_flag_w IS NULL THEN 1 ELSE 0 END) AS null_first_game
FROM team_week_panel
""").df()

Unnamed: 0,null_off_snaps,null_def_snaps,null_off_ypp,null_off_no_play,null_def_no_play,null_score_diff,null_blowout,null_days_rest,null_short_week,null_bye_last_week,null_home,null_first_game
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Quick sanity check to confirm that every observation destined for the regression model is "feature-complete," which means that no critical control variables are missing

In [23]:
# NULL counts restricted to the rows where Inj_Off_NextGame_w is not NULL.
con.execute("""
SELECT
  COUNT(*) AS rows_model_sample,
  SUM(CASE WHEN offensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_off_snaps,
  SUM(CASE WHEN defensive_snaps_w IS NULL THEN 1 ELSE 0 END) AS null_def_snaps,
  SUM(CASE WHEN off_yards_per_play_w IS NULL THEN 1 ELSE 0 END) AS null_off_ypp,
  SUM(CASE WHEN score_diff_w IS NULL THEN 1 ELSE 0 END) AS null_score_diff,
  SUM(CASE WHEN blowout_flag_w IS NULL THEN 1 ELSE 0 END) AS null_blowout,
  SUM(CASE WHEN days_rest_w IS NULL THEN 1 ELSE 0 END) AS null_days_rest,
  SUM(CASE WHEN short_week_flag_w IS NULL THEN 1 ELSE 0 END) AS null_short_week,
  SUM(CASE WHEN bye_last_week_flag_w IS NULL THEN 1 ELSE 0 END) AS null_bye_last_week,
  SUM(CASE WHEN home_flag_w IS NULL THEN 1 ELSE 0 END) AS null_home,
  SUM(CASE WHEN points_for IS NULL THEN 1 ELSE 0 END) AS null_points_for,
  SUM(CASE WHEN points_against IS NULL THEN 1 ELSE 0 END) AS null_points_against
FROM team_week_panel
WHERE Inj_Off_NextGame_w IS NOT NULL
""").df()

Unnamed: 0,rows_model_sample,null_off_snaps,null_def_snaps,null_off_ypp,null_score_diff,null_blowout,null_days_rest,null_short_week,null_bye_last_week,null_home,null_points_for,null_points_against
0,6366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Quick sanity check to confirm that the 'score_diff_w' and 'blowout_flag_w' logic is internally consistent

In [24]:
# Both counts are zero when blowout_flag_w is 1 exactly where |score_diff_w| >= 14.
con.execute("""
SELECT
  SUM(CASE WHEN blowout_flag_w = 1 AND ABS(score_diff_w) < 14 THEN 1 ELSE 0 END) AS bad_blowout_ones,
  SUM(CASE WHEN blowout_flag_w = 0 AND ABS(score_diff_w) >= 14 THEN 1 ELSE 0 END) AS bad_blowout_zeros
FROM team_week_panel
""").df()

Unnamed: 0,bad_blowout_ones,bad_blowout_zeros
0,0.0,0.0
