We initialize the Python imports and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd

from pathlib import Path

cwd = Path.cwd()

# Search the working directory first, then each ancestor in turn, and stop at
# the first one that contains an entry named "db".
root = None
for p in (cwd, *cwd.parents):
    if not (p / "db").exists():
        continue
    root = p
    break

if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root.joinpath("db", "nflpa.duckdb")
print("Using DB_PATH", DB_PATH)

# Single shared connection; every later cell issues its SQL through `con`.
con = duckdb.connect(str(DB_PATH))

# Session settings: cap worker threads and memory for this connection.
for pragma in ("PRAGMA threads=4", "PRAGMA memory_limit='4GB'"):
    con.execute(pragma)

We confirm that 'team_week_panel' exists and we set 'TEAM_COL' and helper functions used to safely rewrite the panel

In [None]:
# Names of all tables reported by the shared connection.
table_listing = con.execute("SHOW TABLES").df()
tables = set(table_listing["name"].tolist())

if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 06 first")

# Snapshot of the panel's columns as they are at this point in the notebook.
# Later cells rewrite team_week_panel, so this snapshot is not refreshed.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# TEAM_COL is the partition key for the window functions below: prefer
# "team_id" when the panel has it, otherwise fall back to "team".
if "team_id" in panel_cols_set:
    TEAM_COL = "team_id"
else:
    TEAM_COL = "team"

# TEAM_ABBR_COL is the column compared against schedule/timezone team values:
# prefer "team", otherwise reuse TEAM_COL.
if "team" in panel_cols_set:
    TEAM_ABBR_COL = "team"
else:
    TEAM_ABBR_COL = TEAM_COL

print("Using TEAM_COL", TEAM_COL)
print("Using TEAM_ABBR_COL", TEAM_ABBR_COL)

def _existing_cols(table_name):
    """Return the set of column names DuckDB currently reports for *table_name*.

    The lookup runs ``PRAGMA table_info`` on the shared module-level ``con``
    each time it is called, so the result reflects the table as it is now.
    """
    info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    return set(info["name"].tolist())

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build a ``alias.*`` select fragment that drops columns already on a table.

    Only the names in *cols_to_maybe_exclude* that exist on *table_name* are
    placed in the ``EXCLUDE`` list; existence is checked against *table_name*,
    not against whatever relation *alias* refers to in the final query.

    Returns ``"<alias>.*"`` when none of the candidates exist, otherwise
    ``"<alias>.* EXCLUDE (a, b, ...)"`` in the order the candidates were given.
    """
    present = _existing_cols(table_name)
    droppable = [name for name in cols_to_maybe_exclude if name in present]
    if not droppable:
        return f"{alias}.*"
    return f"{alias}.* EXCLUDE ({', '.join(droppable)})"

We validate that the required workload and volatility columns are in the table and also ensure we have a complete set of inputs before attempting to compress the data using PCA

In [None]:
# Columns that the cumulative-feature cells in this notebook read from
# team_week_panel without any fallback. ST_Load_ScoreLinked_w is listed here
# because the expanded-totals cell sums it unconditionally; validating it up
# front makes the notebook fail before any CREATE OR REPLACE has rewritten the
# panel, instead of part-way through.
required_cols = [
    "season",
    "week",
    "game_id",
    TEAM_ABBR_COL,
    "offensive_snaps",
    "defensive_snaps",
    "ST_Load_NonScore_w",
    "ST_Load_ScoreLinked_w",
    "short_week_flag",
]

# Compared against the column snapshot taken when the notebook started
# (exact, case-sensitive name match).
missing = [c for c in required_cols if c not in panel_cols_set]
if missing:
    raise RuntimeError(f"Missing required columns in team_week_panel, {missing}")

We build a team to timezone mapping table that supports historical team abbreviations for travel and timezone shift features

In [None]:
# Build a TEMP lookup table mapping a team abbreviation to an hour offset from
# UTC (columns: team, tz_utc_offset). There is one fixed value per team and no
# date dimension, so daylight-saving shifts are not modeled.
# The last three rows (OAK, SD, STL) sit apart from the main list; per the
# notebook text they cover historical abbreviations.
# NOTE(review): any abbreviation not listed here yields NULL offsets in the
# LEFT JOINs of the travel-flags cell; the coverage check cell counts those.
con.execute("""
CREATE OR REPLACE TEMP TABLE team_timezone_map AS
SELECT * FROM (VALUES
  ('ARI', -7),
  ('ATL', -5),
  ('BAL', -5),
  ('BUF', -5),
  ('CAR', -5),
  ('CHI', -6),
  ('CIN', -5),
  ('CLE', -5),
  ('DAL', -6),
  ('DEN', -7),
  ('DET', -5),
  ('GB', -6),
  ('HOU', -6),
  ('IND', -5),
  ('JAX', -5),
  ('KC', -6),
  ('LA', -8),
  ('LAC', -8),
  ('LV', -8),
  ('MIA', -5),
  ('MIN', -6),
  ('NE', -5),
  ('NO', -6),
  ('NYG', -5),
  ('NYJ', -5),
  ('PHI', -5),
  ('PIT', -5),
  ('SEA', -8),
  ('SF', -8),
  ('TB', -5),
  ('TEN', -6),
  ('WAS', -5),

  ('OAK', -8),
  ('SD', -8),
  ('STL', -6)
) AS t(team, tz_utc_offset)
""")

We create a granular travel reference table that joins with the schedule to identify "away" status and also verify that the resulting travel fatigue indicators are only attributed to the team's active game weeks

In [None]:
sched_cols = _existing_cols("schedules")

# The schedule's date column goes by different names depending on the loader;
# the first candidate present wins.
date_col = next(
    (c for c in ["game_date", "gameday", "game_day", "date"] if c in sched_cols),
    None,
)

if date_col is None:
    raise RuntimeError("No schedule date column found in schedules, expected one of game_date, gameday, game_day, date")

# Explicit neutral-site indicator column, if the schedule has one.
neutral_col = next(
    (c for c in ["neutral_site", "neutral", "is_neutral_site"] if c in sched_cols),
    None,
)

# `neutral_expr` is a SQL expression over the alias `s` that evaluates to 0/1.
# It is interpolated below and reused by the neutral-site sanity check cell,
# so it must stay valid anywhere `schedules s` is in scope.
if neutral_col is not None:
    neutral_expr = f"COALESCE(CAST(s.{neutral_col} AS INTEGER), 0)"
elif "location" in sched_cols:
    # Fallback for schedules with no dedicated flag column.
    # NOTE(review): assumes nflverse-style data where location is 'Home' or
    # 'Neutral' -- confirm against the schedule loader. Any other value
    # (including NULL) evaluates to 0, i.e. the previous behavior.
    neutral_expr = "CASE WHEN LOWER(CAST(s.location AS VARCHAR)) = 'neutral' THEN 1 ELSE 0 END"
else:
    neutral_expr = "0"

# One output row per panel row that matches a completed regular-season game
# (inner join on season + game_id). For each row:
#   is_home              1 only when the team is the listed home team and the
#                        game is not flagged neutral; otherwise 0
#   tz_shift_hours       home_tz - team_tz for non-home rows (0 if unknown)
#   long_travel_flag     non-home and (neutral site or |shift| >= 2)
#   timezone_change_flag non-home and (neutral site or |shift| >= 1)
#   west_to_east_flag    non-home, non-neutral and shift >= 2
con.execute(f"""
CREATE OR REPLACE TABLE team_week_travel_flags AS
WITH sched AS (
  SELECT
    season,
    week,
    game_id,
    home_team,
    away_team,
    {date_col} AS game_date,
    {neutral_expr} AS neutral_site_flag
  FROM schedules s
  WHERE s.game_type = 'REG'
    AND s.home_score IS NOT NULL
    AND s.away_score IS NOT NULL
),
keys AS (
  SELECT
    p.season,
    p.week,
    p.{TEAM_ABBR_COL} AS team,
    p.game_id
  FROM team_week_panel p
),
joined AS (
  SELECT
    k.season,
    k.week,
    k.team,
    s.home_team,
    s.away_team,
    s.game_date,
    s.neutral_site_flag,
    CASE
      WHEN s.neutral_site_flag = 1 THEN 0
      WHEN k.team = s.home_team THEN 1
      WHEN k.team = s.away_team THEN 0
      ELSE NULL
    END AS is_home
  FROM keys k
  JOIN sched s
    ON k.season = s.season
   AND k.game_id = s.game_id
),
tz AS (
  SELECT
    j.*,
    th.tz_utc_offset AS team_tz,
    hh.tz_utc_offset AS home_tz
  FROM joined j
  LEFT JOIN team_timezone_map th
    ON th.team = j.team
  LEFT JOIN team_timezone_map hh
    ON hh.team = j.home_team
)
SELECT
  season,
  week,
  team,

  COALESCE(is_home, 0) AS is_home,

  CASE
    WHEN is_home = 1 THEN 0
    ELSE COALESCE(home_tz - team_tz, 0)
  END AS tz_shift_hours,

  CASE
    WHEN is_home = 0 AND (neutral_site_flag = 1 OR ABS(COALESCE(home_tz - team_tz, 0)) >= 2) THEN 1
    ELSE 0
  END AS long_travel_flag,

  CASE
    WHEN is_home = 0 AND (neutral_site_flag = 1 OR ABS(COALESCE(home_tz - team_tz, 0)) >= 1) THEN 1
    ELSE 0
  END AS timezone_change_flag,

  CASE
    WHEN is_home = 0 AND neutral_site_flag = 0 AND COALESCE(home_tz - team_tz, 0) >= 2 THEN 1
    ELSE 0
  END AS west_to_east_flag

FROM tz
""")

Quick sanity check to confirm that the travel helper table has a matching row for every game in the main panel and also verifying that no team was left behind due to a missing abbreviation or relocation record

In [None]:
# Coverage report for team_week_travel_flags; returns a one-row DataFrame and
# asserts nothing, so the numbers have to be read by eye.
#   panel_rows / travel_flag_rows   raw row counts of the two tables
#   n_missing_travel_flag_rows      panel (season, week, team) keys with no
#                                   matching row in the flags table
#   n_missing_team_tz / _home_tz    flag rows, re-joined to completed REG
#                                   schedule rows on season + week + team,
#                                   whose team / home team has no entry in
#                                   team_timezone_map
con.execute(f"""
WITH panel_keys AS (
  SELECT season, week, {TEAM_ABBR_COL} AS team
  FROM team_week_panel
),
flags_keys AS (
  SELECT season, week, team
  FROM team_week_travel_flags
),
missing_in_flags AS (
  SELECT p.*
  FROM panel_keys p
  LEFT JOIN flags_keys f
    ON p.season = f.season AND p.week = f.week AND p.team = f.team
  WHERE f.team IS NULL
),
missing_tz AS (
  SELECT
    SUM(CASE WHEN team_tz IS NULL THEN 1 ELSE 0 END) AS n_missing_team_tz,
    SUM(CASE WHEN home_tz IS NULL THEN 1 ELSE 0 END) AS n_missing_home_tz
  FROM (
    SELECT
      f.team,
      m1.tz_utc_offset AS team_tz,
      s.home_team,
      m2.tz_utc_offset AS home_tz
    FROM team_week_travel_flags f
    JOIN schedules s
      ON f.season = s.season AND f.week = s.week
     AND (s.home_team = f.team OR s.away_team = f.team)
    LEFT JOIN team_timezone_map m1
      ON m1.team = f.team
    LEFT JOIN team_timezone_map m2
      ON m2.team = s.home_team
    WHERE s.game_type = 'REG'
      AND s.home_score IS NOT NULL
      AND s.away_score IS NOT NULL
  ) t
)
SELECT
  (SELECT COUNT(*) FROM panel_keys) AS panel_rows,
  (SELECT COUNT(*) FROM flags_keys) AS travel_flag_rows,
  (SELECT COUNT(*) FROM missing_in_flags) AS n_missing_travel_flag_rows,
  (SELECT n_missing_team_tz FROM missing_tz) AS n_missing_team_tz,
  (SELECT n_missing_home_tz FROM missing_tz) AS n_missing_home_tz
""").df()

Quick sanity check to confirm that neutral site games are never marked as "home" in 'team_week_travel_flags'.

In [None]:
# Neutral-site report; returns a one-row DataFrame and asserts nothing.
#   neutral_team_rows          flag rows whose panel game_id belongs to a
#                              completed REG game where neutral_expr = 1
#   neutral_rows_marked_home   how many of those rows have is_home = 1
# Depends on `neutral_expr` from the travel-flags cell, which is written
# against the alias `s`; when that expression is the constant "0" the
# sched_neutral CTE is empty and the first count is 0.
con.execute(f"""
WITH sched_neutral AS (
  SELECT season, game_id
  FROM schedules s
  WHERE s.game_type = 'REG'
    AND s.home_score IS NOT NULL
    AND s.away_score IS NOT NULL
    AND {neutral_expr} = 1
),
flags_neutral AS (
  SELECT
    f.season,
    f.team,
    f.is_home
  FROM team_week_travel_flags f
  JOIN team_week_panel p
    ON p.season = f.season
   AND p.week = f.week
   AND p.{TEAM_ABBR_COL} = f.team
  JOIN sched_neutral sn
    ON sn.season = p.season
   AND sn.game_id = p.game_id
)
SELECT
  COUNT(*) AS neutral_team_rows,
  SUM(CASE WHEN is_home = 1 THEN 1 ELSE 0 END) AS neutral_rows_marked_home
FROM flags_neutral
""").df()

We compute the required season-to-date cumulative workload totals, including offensive snaps, defensive snaps, and special teams load, and also verify that each week's value includes all previous game workloads within that specific team-season

In [None]:
# Output columns this cell (re)creates on team_week_panel.
cols_to_replace_optional = [
    "cum_off_snaps_w",
    "cum_def_snaps_w",
    "cum_ST_Load_w",
    "cum_short_weeks_w",
    "cum_long_travel_w",
]

# Scratch columns computed inside the `base` CTE; they must not be persisted.
helper_cols = [
    "_cum_off_snaps",
    "_cum_def_snaps",
    "_cum_st_load",
    "_cum_short_weeks",
    "_cum_long_travel",
]

# Drop stale copies (outputs from an earlier run, or scratch columns leaked by
# an earlier version of this cell) from `p.*` so `base` never carries two
# columns with the same name. _star_excluding only excludes names that exist
# on the stored table, which is the right test for the alias `p`.
panel_star = _star_excluding("team_week_panel", "p", cols_to_replace_optional + helper_cols)

# The scratch columns always exist in `base` because the CTE defines them, so
# they are excluded unconditionally. Checking for them on team_week_panel (as
# _star_excluding does) would miss them on a first run and write them into the
# panel.
star = f"base.* EXCLUDE ({', '.join(helper_cols)})"

# Running totals per (season, TEAM_COL) ordered by week, current row included.
# NULL inputs count as 0; travel flags come from a LEFT JOIN, so panel rows
# with no flag row contribute 0 to the travel total.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {panel_star},

    SUM(COALESCE(p.offensive_snaps, 0)) OVER w AS _cum_off_snaps,
    SUM(COALESCE(p.defensive_snaps, 0)) OVER w AS _cum_def_snaps,
    SUM(COALESCE(p.ST_Load_NonScore_w, 0)) OVER w AS _cum_st_load,
    SUM(COALESCE(p.short_week_flag, 0)) OVER w AS _cum_short_weeks,
    SUM(COALESCE(tf.long_travel_flag, 0)) OVER w AS _cum_long_travel

  FROM team_week_panel p
  LEFT JOIN team_week_travel_flags tf
    ON tf.season = p.season
   AND tf.week = p.week
   AND tf.team = p.{TEAM_ABBR_COL}

  WINDOW w AS (
    PARTITION BY p.season, p.{TEAM_COL}
    ORDER BY p.week
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
)
SELECT
  {star},

  COALESCE(_cum_off_snaps, 0) AS cum_off_snaps_w,
  COALESCE(_cum_def_snaps, 0) AS cum_def_snaps_w,
  COALESCE(_cum_st_load, 0) AS cum_ST_Load_w,
  COALESCE(_cum_short_weeks, 0) AS cum_short_weeks_w,
  COALESCE(_cum_long_travel, 0) AS cum_long_travel_w

FROM base
""")

Quick sanity check to confirm that every cumulative feature, from snaps to travel counts, is fully populated for every game row, and also verifying that these totals never "reset" or drop as the season progresses

In [None]:
# Integrity report for the five primary cumulative columns; returns a one-row
# DataFrame and asserts nothing.
#   n_null_*        rows where the cumulative column is NULL
#   n_decreasing_*  rows whose value is below the previous week's value within
#                   the same (season, TEAM_COL) partition; the first week of a
#                   partition has no predecessor and is never counted
con.execute(f"""
WITH chk AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    week,

    cum_off_snaps_w,
    LAG(cum_off_snaps_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_off,

    cum_def_snaps_w,
    LAG(cum_def_snaps_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_def,

    cum_ST_Load_w,
    LAG(cum_ST_Load_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_st,

    cum_short_weeks_w,
    LAG(cum_short_weeks_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_short,

    cum_long_travel_w,
    LAG(cum_long_travel_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_travel
  FROM team_week_panel
)
SELECT
  SUM(CASE WHEN cum_off_snaps_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_off,
  SUM(CASE WHEN cum_def_snaps_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_def,
  SUM(CASE WHEN cum_ST_Load_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_st,
  SUM(CASE WHEN cum_short_weeks_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_short,
  SUM(CASE WHEN cum_long_travel_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_travel,

  SUM(CASE WHEN prev_cum_off IS NOT NULL AND cum_off_snaps_w < prev_cum_off THEN 1 ELSE 0 END) AS n_decreasing_cum_off,
  SUM(CASE WHEN prev_cum_def IS NOT NULL AND cum_def_snaps_w < prev_cum_def THEN 1 ELSE 0 END) AS n_decreasing_cum_def,
  SUM(CASE WHEN prev_cum_st IS NOT NULL AND cum_ST_Load_w < prev_cum_st THEN 1 ELSE 0 END) AS n_decreasing_cum_st,
  SUM(CASE WHEN prev_cum_short IS NOT NULL AND cum_short_weeks_w < prev_cum_short THEN 1 ELSE 0 END) AS n_decreasing_cum_short,
  SUM(CASE WHEN prev_cum_travel IS NOT NULL AND cum_long_travel_w < prev_cum_travel THEN 1 ELSE 0 END) AS n_decreasing_cum_travel
FROM chk
""").df()

We compute the expanded cumulative workload totals, including score-linked special teams load and timezone shift counts, and also verify that these secondary metrics are ready for testing alongside the primary inputs

In [None]:
# Output columns this cell (re)creates on team_week_panel.
cols_to_replace_optional = [
    "cum_ST_ScoreLinked_w",
    "cum_timezone_changes_w",
    "cum_west_to_east_w",
]

# Scratch columns computed inside the `base` CTE; they must not be persisted.
helper_cols = [
    "_cum_st_scorelinked",
    "_cum_tz_changes",
    "_cum_west_to_east",
]

# Drop stale copies (outputs from an earlier run, or scratch columns leaked by
# an earlier version of this cell) from `p.*` so `base` never carries two
# columns with the same name.
panel_star = _star_excluding("team_week_panel", "p", cols_to_replace_optional + helper_cols)

# The scratch columns always exist in `base` because the CTE defines them, so
# they are excluded unconditionally rather than by checking the stored table,
# which would miss them on a first run and write them into the panel.
star = f"base.* EXCLUDE ({', '.join(helper_cols)})"

# Running totals per (season, TEAM_COL) ordered by week, current row included.
# NULL inputs count as 0; panel rows with no travel-flag row contribute 0.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {panel_star},

    SUM(COALESCE(p.ST_Load_ScoreLinked_w, 0)) OVER w AS _cum_st_scorelinked,
    SUM(COALESCE(tf.timezone_change_flag, 0)) OVER w AS _cum_tz_changes,
    SUM(COALESCE(tf.west_to_east_flag, 0)) OVER w AS _cum_west_to_east

  FROM team_week_panel p
  LEFT JOIN team_week_travel_flags tf
    ON tf.season = p.season
   AND tf.week = p.week
   AND tf.team = p.{TEAM_ABBR_COL}

  WINDOW w AS (
    PARTITION BY p.season, p.{TEAM_COL}
    ORDER BY p.week
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
)
SELECT
  {star},

  COALESCE(_cum_st_scorelinked, 0) AS cum_ST_ScoreLinked_w,
  COALESCE(_cum_tz_changes, 0) AS cum_timezone_changes_w,
  COALESCE(_cum_west_to_east, 0) AS cum_west_to_east_w

FROM base
""")

Quick sanity check to confirm that the expanded features, such as score-linked special teams load and timezone shift counts, are fully populated for every game, and also verifying that these totals do not drop as the season continues

In [None]:
# Integrity report for the three expanded cumulative columns; returns a
# one-row DataFrame and asserts nothing.
#   n_null_*        rows where the cumulative column is NULL
#   n_decreasing_*  rows whose value is below the previous week's value within
#                   the same (season, TEAM_COL) partition
con.execute(f"""
WITH chk AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    week,

    cum_ST_ScoreLinked_w,
    LAG(cum_ST_ScoreLinked_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_st_sl,

    cum_timezone_changes_w,
    LAG(cum_timezone_changes_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_tz,

    cum_west_to_east_w,
    LAG(cum_west_to_east_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_cum_w2e
  FROM team_week_panel
)
SELECT
  SUM(CASE WHEN cum_ST_ScoreLinked_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_st_sl,
  SUM(CASE WHEN cum_timezone_changes_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_tz,
  SUM(CASE WHEN cum_west_to_east_w IS NULL THEN 1 ELSE 0 END) AS n_null_cum_w2e,

  SUM(CASE WHEN prev_cum_st_sl IS NOT NULL AND cum_ST_ScoreLinked_w < prev_cum_st_sl THEN 1 ELSE 0 END) AS n_decreasing_cum_st_sl,
  SUM(CASE WHEN prev_cum_tz IS NOT NULL AND cum_timezone_changes_w < prev_cum_tz THEN 1 ELSE 0 END) AS n_decreasing_cum_tz,
  SUM(CASE WHEN prev_cum_w2e IS NOT NULL AND cum_west_to_east_w < prev_cum_w2e THEN 1 ELSE 0 END) AS n_decreasing_cum_w2e
FROM chk
""").df()

We add extra cumulative workload controls for PCA robustness, including total snaps, cumulative rest deficit, and away game exposure, and also verify that these features are properly aligned to the team-week grain

In [None]:
# Inputs this cell needs beyond the columns validated earlier in the notebook.
extra_needed = ["days_rest"]

# Re-read the live schema: the panel has been rewritten by earlier cells, so
# the column snapshot taken at the top of the notebook may be out of date.
panel_cols_now = _existing_cols("team_week_panel")

if any(c not in panel_cols_now for c in extra_needed):
    raise RuntimeError("days_rest missing in team_week_panel, rebuild notebook 03 before Step 7.3")

# Output columns this cell (re)creates on team_week_panel.
cols_to_replace_optional = [
    "cum_total_snaps_w",
    "cum_rest_deficit_days_w",
    "cum_away_games_w",
    "cum_byes_w",
]

# Scratch columns computed inside the `base` CTE; they must not be persisted.
helper_cols = [
    "_cum_total_snaps",
    "_cum_rest_deficit_days",
    "_cum_away_games",
    "_cum_byes",
]

# Drop stale copies (outputs from an earlier run, or scratch columns leaked by
# an earlier version of this cell) from `p.*` so `base` never carries two
# columns with the same name.
panel_star = _star_excluding("team_week_panel", "p", cols_to_replace_optional + helper_cols)

# The scratch columns always exist in `base` because the CTE defines them, so
# they are excluded unconditionally rather than by checking the stored table,
# which would miss them on a first run and write them into the panel.
star = f"base.* EXCLUDE ({', '.join(helper_cols)})"

# bye_last_week_flag is optional: without it the bye total is a constant 0.
bye_expr = "0"
if "bye_last_week_flag" in panel_cols_now:
    bye_expr = "COALESCE(p.bye_last_week_flag, 0)"

# Running totals per (season, TEAM_COL) ordered by week, current row included:
#   _cum_total_snaps        offensive + defensive snaps (NULL counts as 0)
#   _cum_rest_deficit_days  sum of (6 - days_rest) over rows with days_rest < 6
#   _cum_away_games         rows whose travel flag has is_home = 0; a panel
#                           row with no flag row is treated as home
#   _cum_byes               sum of bye_expr
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {panel_star},

    SUM(COALESCE(p.offensive_snaps, 0) + COALESCE(p.defensive_snaps, 0)) OVER w AS _cum_total_snaps,

    SUM(
      CASE
        WHEN p.days_rest IS NULL THEN 0
        WHEN p.days_rest < 6 THEN (6 - p.days_rest)
        ELSE 0
      END
    ) OVER w AS _cum_rest_deficit_days,

    SUM(CASE WHEN COALESCE(tf.is_home, 1) = 0 THEN 1 ELSE 0 END) OVER w AS _cum_away_games,

    SUM({bye_expr}) OVER w AS _cum_byes

  FROM team_week_panel p
  LEFT JOIN team_week_travel_flags tf
    ON tf.season = p.season
   AND tf.week = p.week
   AND tf.team = p.{TEAM_ABBR_COL}

  WINDOW w AS (
    PARTITION BY p.season, p.{TEAM_COL}
    ORDER BY p.week
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
)
SELECT
  {star},

  COALESCE(_cum_total_snaps, 0) AS cum_total_snaps_w,
  COALESCE(_cum_rest_deficit_days, 0) AS cum_rest_deficit_days_w,
  COALESCE(_cum_away_games, 0) AS cum_away_games_w,
  COALESCE(_cum_byes, 0) AS cum_byes_w

FROM base
""")

Quick sanity check to confirm that the newly added features, such as total snaps, cumulative rest deficit, and away game exposure, are fully populated for every game, and also verifying that these counts never decline as the season moves forward

In [None]:
# Integrity report for the four robustness-control columns; returns a one-row
# DataFrame and asserts nothing.
#   n_null_*        rows where the cumulative column is NULL
#   n_decreasing_*  rows whose value is below the previous week's value within
#                   the same (season, TEAM_COL) partition
con.execute(f"""
WITH chk AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    week,

    cum_total_snaps_w,
    LAG(cum_total_snaps_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_total,

    cum_rest_deficit_days_w,
    LAG(cum_rest_deficit_days_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_deficit,

    cum_away_games_w,
    LAG(cum_away_games_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_away,

    cum_byes_w,
    LAG(cum_byes_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_byes
  FROM team_week_panel
)
SELECT
  SUM(CASE WHEN cum_total_snaps_w IS NULL THEN 1 ELSE 0 END) AS n_null_total,
  SUM(CASE WHEN cum_rest_deficit_days_w IS NULL THEN 1 ELSE 0 END) AS n_null_deficit,
  SUM(CASE WHEN cum_away_games_w IS NULL THEN 1 ELSE 0 END) AS n_null_away,
  SUM(CASE WHEN cum_byes_w IS NULL THEN 1 ELSE 0 END) AS n_null_byes,

  SUM(CASE WHEN prev_total IS NOT NULL AND cum_total_snaps_w < prev_total THEN 1 ELSE 0 END) AS n_decreasing_total,
  SUM(CASE WHEN prev_deficit IS NOT NULL AND cum_rest_deficit_days_w < prev_deficit THEN 1 ELSE 0 END) AS n_decreasing_deficit,
  SUM(CASE WHEN prev_away IS NOT NULL AND cum_away_games_w < prev_away THEN 1 ELSE 0 END) AS n_decreasing_away,
  SUM(CASE WHEN prev_byes IS NOT NULL AND cum_byes_w < prev_byes THEN 1 ELSE 0 END) AS n_decreasing_byes
FROM chk
""").df()

Quick sanity check to confirm that every cumulative feature we built, including the primary workload totals, expanded sensitivity inputs, and the robustness controls, is physically present in the panel and also ready for the dimensionality reduction step

In [None]:
# Every column the cumulative cells above are expected to have written.
new_cols = [
    # Primary workload totals.
    "cum_off_snaps_w",
    "cum_def_snaps_w",
    "cum_ST_Load_w",
    "cum_short_weeks_w",
    "cum_long_travel_w",
    # Expanded inputs.
    "cum_ST_ScoreLinked_w",
    "cum_timezone_changes_w",
    "cum_west_to_east_w",
    # Robustness controls.
    "cum_total_snaps_w",
    "cum_rest_deficit_days_w",
    "cum_away_games_w",
    "cum_byes_w",
]

# Compare against the live schema rather than the start-of-notebook snapshot.
cols_now = _existing_cols("team_week_panel")
missing_new = [c for c in new_cols if c not in cols_now]

print("Missing new cols", missing_new)

# Report only; a missing column does not raise.
if missing_new:
    print("STOP, Step 7 did not persist correctly")
else:
    print("OK")