We import the Python libraries and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd
from pathlib import Path

# Locate the project root: the current working directory itself, or the
# nearest ancestor, that contains an entry named "db".
cwd = Path.cwd()
root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "db").exists()),
    None,
)

if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# Single connection shared by the cells below.
con = duckdb.connect(str(DB_PATH))

# Session-level settings: worker thread count and memory ceiling.
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='4GB'")

We validate the prerequisites and decide which team key column to use, so that the joins in the upcoming modeling phase line up across the different data sources

In [None]:
# Names of every table visible on the shared connection.
tables = set(con.execute("SHOW TABLES").df()["name"].tolist())

if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 05 first")

# Column metadata for the panel, kept as a DataFrame, an ordered list of
# names, and a set of names for membership tests.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# Columns the queries below read from team_week_panel.
required_cols = [
    "season",
    "week",
    "ST_Load_All_w",
    "ST_Load_ScoreLinked_w",
    "ST_Load_NonScore_w",
    "ST_Shock_All_w",
    "ST_Shock_ScoreLinked_w",
    "ST_Shock_NonScore_w",
]
missing = [c for c in required_cols if c not in panel_cols_set]
if missing:
    raise RuntimeError(f"Missing required columns in team_week_panel, {missing}")

# Prefer team_id and fall back to team. Fail here with a clear message when
# neither column exists, instead of letting a later query fail on an unknown
# column name.
TEAM_COL = next((c for c in ("team_id", "team") if c in panel_cols_set), None)
if TEAM_COL is None:
    raise RuntimeError("team_week_panel has neither a team_id nor a team column")
print("Using TEAM_COL", TEAM_COL)

def _existing_cols(table_name):
    """Return the set of column names DuckDB reports for ``table_name``.

    The names are read from the ``name`` column of ``PRAGMA table_info``,
    executed on the notebook-wide connection ``con``.
    """
    table_info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    return set(table_info["name"].tolist())

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build an ``<alias>.*`` projection that omits columns found in a table.

    Only the entries of ``cols_to_maybe_exclude`` that currently exist in
    ``table_name`` are placed in the ``EXCLUDE`` list, in the order given.
    When none of them exist, a plain ``<alias>.*`` is returned.
    """
    present = _existing_cols(table_name)
    to_drop = [col for col in cols_to_maybe_exclude if col in present]
    if not to_drop:
        return f"{alias}.*"
    excluded = ", ".join(to_drop)
    return f"{alias}.* EXCLUDE ({excluded})"

Quick sanity check to confirm that we still have all the seasons and weeks we started with, and that the table hasn't been accidentally filtered during the recent processing steps

In [None]:
# Per-season summary of the panel: row count, number of distinct team keys,
# and the first and last week present.
season_coverage_sql = f"""
SELECT
  season,
  COUNT(*) AS rows,
  COUNT(DISTINCT {TEAM_COL}) AS teams,
  MIN(week) AS min_week,
  MAX(week) AS max_week
FROM team_week_panel
GROUP BY season
ORDER BY season
"""
con.execute(season_coverage_sql).df()

Quick sanity check to find team-seasons from 2021 onward that do not have exactly 17 games recorded

In [None]:
# Team-seasons from 2021 onward whose row count in the panel differs from 17.
# The season filter is a plain row predicate, so it belongs in WHERE (applied
# before grouping); HAVING keeps only the aggregate condition. The result is
# the same as filtering on the grouping column after aggregation.
con.execute(f"""
SELECT
  season,
  {TEAM_COL} AS team,
  COUNT(*) AS n_games
FROM team_week_panel
WHERE season >= 2021
GROUP BY season, {TEAM_COL}
HAVING COUNT(*) <> 17
ORDER BY season, team
""").df()

Quick sanity check to find which of weeks 1 through 18 are missing in the 2022 season for those teams

In [None]:
# For every 2022 team whose panel row count is not 17, list the weeks in
# 1..18 that have no row for that team.
#   team_counts    - the affected teams (season filter in WHERE, aggregate
#                    condition in HAVING)
#   expected_weeks - weeks 1..18 (range's upper bound is exclusive)
#   team_expected  - every (team, week) pair that could exist
#   team_actual    - the (team, week) pairs actually present in the panel
# The anti-join (LEFT JOIN ... WHERE ta.team IS NULL) keeps the expected
# pairs with no matching panel row.
con.execute(f"""
WITH team_counts AS (
  SELECT
    season,
    {TEAM_COL} AS team,
    COUNT(*) AS n_games
  FROM team_week_panel
  WHERE season = 2022
  GROUP BY season, {TEAM_COL}
  HAVING COUNT(*) <> 17
),
expected_weeks AS (
  SELECT 2022 AS season, w AS week
  FROM range(1, 19) t(w)
),
team_expected AS (
  SELECT tc.team, ew.season, ew.week
  FROM team_counts tc
  CROSS JOIN expected_weeks ew
),
team_actual AS (
  SELECT season, week, {TEAM_COL} AS team
  FROM team_week_panel
  WHERE season = 2022
)
SELECT
  te.team,
  te.week AS missing_week
FROM team_expected te
LEFT JOIN team_actual ta
  ON te.season = ta.season
 AND te.week = ta.week
 AND te.team = ta.team
WHERE ta.team IS NULL
ORDER BY te.team, te.week
""").df()

Quick sanity check to list every 2022 schedule row involving BUF or CIN and confirm that the rows being inspected are not restricted to regular-season games

In [None]:
# Every 2022 row of the schedules table in which BUF or CIN is the home or
# away team, ordered by week and game id.
buf_cin_schedule_sql = """
SELECT
  season,
  week,
  game_id,
  home_team,
  away_team,
  home_score,
  away_score
FROM schedules
WHERE season = 2022
  AND (home_team IN ('BUF','CIN') OR away_team IN ('BUF','CIN'))
ORDER BY week, game_id
"""
con.execute(buf_cin_schedule_sql).df()

Quick sanity check to confirm whether a BUF versus CIN matchup row exists in the schedules table for season 2022

In [None]:
# 2022 rows of the schedules table where BUF and CIN face each other, with
# either team as the home side.
buf_cin_matchup_sql = """
SELECT
  season,
  week,
  game_id,
  home_team,
  away_team,
  home_score,
  away_score
FROM schedules
WHERE season = 2022
  AND (
    (home_team = 'BUF' AND away_team = 'CIN')
    OR
    (home_team = 'CIN' AND away_team = 'BUF')
  )
ORDER BY week, game_id
"""
con.execute(buf_cin_matchup_sql).df()

We compute season-to-date volatility (an expanding-window sample standard deviation) for each special teams workload bucket, so that these statistics capture how much a team's special teams usage fluctuates as the season progresses

In [None]:
# Derived columns this cell (re)creates on team_week_panel.
cols_to_replace_optional = [
    "ST_Games_ToDate_w",
    "ST_Vol_All_w",
    "ST_Vol_ScoreLinked_w",
    "ST_Vol_NonScore_w",
]

# Scratch window outputs that should exist only inside the `base` CTE.
scratch_cols = [
    "_st_n_to_date",
    "_st_vol_all_raw",
    "_st_vol_scorelinked_raw",
    "_st_vol_nonscore_raw",
]

# Source projection: drop any copies of the derived or scratch columns that an
# earlier run left in the stored table, so `base` never holds duplicate names
# and the cell can be re-run safely.
source_star = _star_excluding(
    "team_week_panel", "p", cols_to_replace_optional + scratch_cols
)

# Output projection: the scratch columns are always present in `base` (the CTE
# creates them), so they are excluded unconditionally. Checking the stored
# table instead would let them leak into the rebuilt table on a first run.
star = f"base.* EXCLUDE ({', '.join(scratch_cols)})"

# Rebuild team_week_panel with season-to-date statistics per (season, team):
#   ST_Games_ToDate_w - number of panel rows up to and including this week
#   ST_Vol_*_w        - sample standard deviation of the matching ST_Load_*_w
#                       over those rows; 0 when fewer than two rows exist or
#                       when the standard deviation is NULL
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {source_star},

    COUNT(*) OVER w AS _st_n_to_date,

    STDDEV_SAMP(ST_Load_All_w) OVER w AS _st_vol_all_raw,
    STDDEV_SAMP(ST_Load_ScoreLinked_w) OVER w AS _st_vol_scorelinked_raw,
    STDDEV_SAMP(ST_Load_NonScore_w) OVER w AS _st_vol_nonscore_raw

  FROM team_week_panel p
  WINDOW w AS (
    PARTITION BY season, {TEAM_COL}
    ORDER BY week
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
)
SELECT
  {star},

  _st_n_to_date AS ST_Games_ToDate_w,

  CASE
    WHEN _st_n_to_date < 2 THEN 0
    ELSE COALESCE(_st_vol_all_raw, 0)
  END AS ST_Vol_All_w,

  CASE
    WHEN _st_n_to_date < 2 THEN 0
    ELSE COALESCE(_st_vol_scorelinked_raw, 0)
  END AS ST_Vol_ScoreLinked_w,

  CASE
    WHEN _st_n_to_date < 2 THEN 0
    ELSE COALESCE(_st_vol_nonscore_raw, 0)
  END AS ST_Vol_NonScore_w

FROM base
""")

Quick sanity check to confirm that our new volatility calculations didn't produce NULL values, and that even the early-season weeks have a default starting volatility value assigned

In [None]:
# Count NULLs in each of the four derived season-to-date columns.
# The query has no placeholders, so a plain string is used.
null_counts_sql = """
SELECT
  SUM(CASE WHEN ST_Games_ToDate_w IS NULL THEN 1 ELSE 0 END) AS n_null_games_to_date,
  SUM(CASE WHEN ST_Vol_All_w IS NULL THEN 1 ELSE 0 END) AS n_null_vol_all,
  SUM(CASE WHEN ST_Vol_ScoreLinked_w IS NULL THEN 1 ELSE 0 END) AS n_null_vol_scorelinked,
  SUM(CASE WHEN ST_Vol_NonScore_w IS NULL THEN 1 ELSE 0 END) AS n_null_vol_nonscore
FROM team_week_panel
"""
con.execute(null_counts_sql).df()

Quick sanity check to confirm that the volatility is exactly zero for every team's first game of the season, and that the season-to-date standard deviation does not inherit values from the previous year

In [None]:
# For each (season, team) find the earliest week in the panel, then count how
# many of those first-game rows have each volatility column equal to zero.
# The three zero counts can be compared against first_game_rows.
first_game_vol_sql = f"""
WITH first_games AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    MIN(week) AS first_week
  FROM team_week_panel
  GROUP BY season, {TEAM_COL}
)
SELECT
  COUNT(*) AS first_game_rows,
  SUM(CASE WHEN p.ST_Vol_All_w = 0 THEN 1 ELSE 0 END) AS vol_all_zero_on_first,
  SUM(CASE WHEN p.ST_Vol_ScoreLinked_w = 0 THEN 1 ELSE 0 END) AS vol_scorelinked_zero_on_first,
  SUM(CASE WHEN p.ST_Vol_NonScore_w = 0 THEN 1 ELSE 0 END) AS vol_nonscore_zero_on_first
FROM team_week_panel p
JOIN first_games f
  ON p.season = f.season
 AND p.{TEAM_COL} = f.team_key
 AND p.week = f.first_week
"""
con.execute(first_game_vol_sql).df()