We import the Python dependencies and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd

from pathlib import Path

# Search the working directory and then each of its ancestors for an entry
# named "db"; the first location that has one is treated as the project root.
cwd = Path.cwd()
root = next(
    (candidate for candidate in [cwd, *cwd.parents] if (candidate / "db").exists()),
    None,
)

if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# Single connection object that the remaining cells reference as `con`.
con = duckdb.connect(str(DB_PATH))

# Session settings: thread count first, then the memory limit.
for pragma in ("PRAGMA threads=4", "PRAGMA memory_limit='4GB'"):
    con.execute(pragma)

We confirm that 'team_week_panel' exists, set 'TEAM_COL' and 'TEAM_ABBR_COL', and define the helper functions used to safely rewrite the panel

In [None]:
# Names of every table visible on the shared connection.
table_listing = con.execute("SHOW TABLES").df()
tables = set(table_listing["name"].tolist())

if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 06 first")

# Column metadata for the panel, kept as a DataFrame, an ordered list and a set.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# TEAM_COL is "team_id" when the panel has that column, otherwise "team".
if "team_id" in panel_cols_set:
    TEAM_COL = "team_id"
else:
    TEAM_COL = "team"

# TEAM_ABBR_COL is "team" when the panel has that column, otherwise it falls
# back to whatever TEAM_COL resolved to.
if "team" in panel_cols_set:
    TEAM_ABBR_COL = "team"
else:
    TEAM_ABBR_COL = TEAM_COL

print("Using TEAM_COL", TEAM_COL)
print("Using TEAM_ABBR_COL", TEAM_ABBR_COL)

def _existing_cols(table_name):
    """Return the set of column names ``PRAGMA table_info`` reports for *table_name*.

    The query runs on the module-level ``con`` connection, and *table_name*
    is interpolated directly into the SQL text.
    """
    info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    return set(info["name"].tolist())

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build a star-select expression for *alias* that omits selected columns.

    Only the names in *cols_to_maybe_exclude* that are actual columns of
    *table_name* are placed in the ``EXCLUDE`` list, in the order given.
    When none of them exist, a plain ``alias.*`` is returned.
    """
    present = _existing_cols(table_name)
    to_drop = [col for col in cols_to_maybe_exclude if col in present]
    if not to_drop:
        return f"{alias}.*"
    excluded = ", ".join(to_drop)
    return f"{alias}.* EXCLUDE ({excluded})"

We validate that the required workload and volatility columns are present in the table, ensuring we have a complete set of inputs before attempting to compress the data using PCA

In [None]:
# Columns team_week_panel must contain: the row keys first (including the
# resolved TEAM_ABBR_COL), followed by the input columns.
required_cols = [
    "season", "week", "game_id", TEAM_ABBR_COL,
    "offensive_snaps", "defensive_snaps",
    "ST_Load_NonScore_w", "short_week_flag",
]

# Stop here, listing every absent column, if any are missing from the panel.
missing = [name for name in required_cols if name not in panel_cols_set]
if missing:
    raise RuntimeError(f"Missing required columns in team_week_panel, {missing}")

We build a team-to-timezone mapping table, including historical team abbreviations, to support the travel and timezone-shift features

In [None]:
# Create (or replace) the TEMP table team_timezone_map with two columns:
# `team` (abbreviation) and `tz_utc_offset` (an integer per team).
# The final three rows (OAK, SD, STL) are listed apart from the first 32 so
# that historical abbreviations also resolve to an offset.
# NOTE(review): the values look like standard-time UTC offsets in hours
# (no DST adjustment) — confirm that is the intended convention.
con.execute("""
CREATE OR REPLACE TEMP TABLE team_timezone_map AS
SELECT * FROM (VALUES
  ('ARI', -7),
  ('ATL', -5),
  ('BAL', -5),
  ('BUF', -5),
  ('CAR', -5),
  ('CHI', -6),
  ('CIN', -5),
  ('CLE', -5),
  ('DAL', -6),
  ('DEN', -7),
  ('DET', -5),
  ('GB', -6),
  ('HOU', -6),
  ('IND', -5),
  ('JAX', -5),
  ('KC', -6),
  ('LA', -8),
  ('LAC', -8),
  ('LV', -8),
  ('MIA', -5),
  ('MIN', -6),
  ('NE', -5),
  ('NO', -6),
  ('NYG', -5),
  ('NYJ', -5),
  ('PHI', -5),
  ('PIT', -5),
  ('SEA', -8),
  ('SF', -8),
  ('TB', -5),
  ('TEN', -6),
  ('WAS', -5),

  ('OAK', -8),
  ('SD', -8),
  ('STL', -6)
) AS t(team, tz_utc_offset)
""")

We create a granular travel reference table by joining the panel with the schedule to identify "away" status, so that the resulting travel fatigue indicators are attributed only to each team's active game weeks

In [None]:
# Column names available on the schedules table.
sched_cols = _existing_cols("schedules")

# Use the first recognised date column, checked in priority order.
date_col = next(
    (name for name in ("game_date", "gameday", "game_day", "date") if name in sched_cols),
    None,
)

if date_col is None:
    raise RuntimeError("No schedule date column found in schedules, expected one of game_date, gameday, game_day, date")

# Same first-match lookup for an optional neutral-site indicator column.
neutral_col = next(
    (name for name in ("neutral_site", "neutral", "is_neutral_site") if name in sched_cols),
    None,
)

# With no such column the SQL expression is the literal 0; otherwise the
# column is cast to INTEGER with NULLs mapped to 0.
if neutral_col is None:
    neutral_expr = "0"
else:
    neutral_expr = f"COALESCE(CAST(s.{neutral_col} AS INTEGER), 0)"

# Create (or replace) team_week_travel_flags: one output row per
# team_week_panel row that inner-joins (on season + game_id) to a schedules
# row with game_type = 'REG' and both scores present.
#
# CTEs:
#   sched  - filtered schedule rows, exposing the detected date column as
#            game_date and `neutral_expr` as neutral_site_flag.
#   keys   - season, week, team (from TEAM_ABBR_COL) and game_id per panel row.
#   joined - adds home/away teams and derives is_home: 1 when the panel team is
#            the home team, 0 when it is the away team, NULL when it is neither.
#   tz     - left-joins team_timezone_map twice to get the offset of the panel
#            team (team_tz) and of the game's home team (home_tz).
#
# Output columns:
#   is_home              - is_home with NULL replaced by 0.
#   tz_shift_hours       - 0 for home rows, else home_tz - team_tz
#                          (0 when either offset is missing).
#   long_travel_flag     - 1 for away rows at a neutral site or with |shift| >= 2.
#   timezone_change_flag - 1 for away rows at a neutral site or with |shift| >= 1.
#   west_to_east_flag    - 1 for away rows at a neutral site or with signed
#                          shift (home_tz - team_tz) >= 2.
#
# NOTE(review): a NULL is_home is emitted as 0 but does not satisfy
# `is_home = 0`, so the three flags stay 0 while tz_shift_hours can still be
# nonzero — confirm this is intended.
# NOTE(review): for neutral-site games only the listed away team is flagged,
# and west_to_east_flag is set regardless of direction — confirm intended.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_travel_flags AS
WITH sched AS (
  SELECT
    season,
    week,
    game_id,
    home_team,
    away_team,
    {date_col} AS game_date,
    {neutral_expr} AS neutral_site_flag
  FROM schedules s
  WHERE s.game_type = 'REG'
    AND s.home_score IS NOT NULL
    AND s.away_score IS NOT NULL
),
keys AS (
  SELECT
    p.season,
    p.week,
    p.{TEAM_ABBR_COL} AS team,
    p.game_id
  FROM team_week_panel p
),
joined AS (
  SELECT
    k.season,
    k.week,
    k.team,
    s.home_team,
    s.away_team,
    s.game_date,
    s.neutral_site_flag,
    CASE
      WHEN k.team = s.home_team THEN 1
      WHEN k.team = s.away_team THEN 0
      ELSE NULL
    END AS is_home
  FROM keys k
  JOIN sched s
    ON k.season = s.season
   AND k.game_id = s.game_id
),
tz AS (
  SELECT
    j.*,
    th.tz_utc_offset AS team_tz,
    hh.tz_utc_offset AS home_tz
  FROM joined j
  LEFT JOIN team_timezone_map th
    ON th.team = j.team
  LEFT JOIN team_timezone_map hh
    ON hh.team = j.home_team
)
SELECT
  season,
  week,
  team,

  COALESCE(is_home, 0) AS is_home,

  CASE
    WHEN is_home = 1 THEN 0
    ELSE COALESCE(home_tz - team_tz, 0)
  END AS tz_shift_hours,

  CASE
    WHEN is_home = 0 AND (neutral_site_flag = 1 OR ABS(COALESCE(home_tz - team_tz, 0)) >= 2) THEN 1
    ELSE 0
  END AS long_travel_flag,

  CASE
    WHEN is_home = 0 AND (neutral_site_flag = 1 OR ABS(COALESCE(home_tz - team_tz, 0)) >= 1) THEN 1
    ELSE 0
  END AS timezone_change_flag,

  CASE
    WHEN is_home = 0 AND (neutral_site_flag = 1 OR COALESCE(home_tz - team_tz, 0) >= 2) THEN 1
    ELSE 0
  END AS west_to_east_flag

FROM tz
""")

A quick sanity check confirms that the travel helper table has a matching row for every row in the main panel, and verifies that no team was left without a timezone because of a missing abbreviation or relocation record

In [None]:
# Coverage check for team_week_travel_flags, returned as a one-row DataFrame:
#   panel_rows                 - row count of team_week_panel.
#   travel_flag_rows           - row count of team_week_travel_flags.
#   n_missing_travel_flag_rows - panel (season, week, team) keys with no
#                                matching row in the flags table.
#   n_missing_team_tz          - flag rows whose team has no entry in
#                                team_timezone_map.
#   n_missing_home_tz          - flag rows whose game's home team has no entry
#                                in team_timezone_map.
# The two timezone counts are taken over flag rows joined to schedules on
# season + week where the team is either side of the game, limited to
# game_type = 'REG' rows with both scores present.
# NOTE(review): the timezone counts are SUMs, so they should come back NULL
# rather than 0 if that join yields no rows — confirm when reading the output.
con.execute(f"""
WITH panel_keys AS (
  SELECT season, week, {TEAM_ABBR_COL} AS team
  FROM team_week_panel
),
flags_keys AS (
  SELECT season, week, team
  FROM team_week_travel_flags
),
missing_in_flags AS (
  SELECT p.*
  FROM panel_keys p
  LEFT JOIN flags_keys f
    ON p.season = f.season AND p.week = f.week AND p.team = f.team
  WHERE f.team IS NULL
),
missing_tz AS (
  SELECT
    SUM(CASE WHEN team_tz IS NULL THEN 1 ELSE 0 END) AS n_missing_team_tz,
    SUM(CASE WHEN home_tz IS NULL THEN 1 ELSE 0 END) AS n_missing_home_tz
  FROM (
    SELECT
      f.team,
      m1.tz_utc_offset AS team_tz,
      s.home_team,
      m2.tz_utc_offset AS home_tz
    FROM team_week_travel_flags f
    JOIN schedules s
      ON f.season = s.season AND f.week = s.week
     AND (s.home_team = f.team OR s.away_team = f.team)
    LEFT JOIN team_timezone_map m1
      ON m1.team = f.team
    LEFT JOIN team_timezone_map m2
      ON m2.team = s.home_team
    WHERE s.game_type = 'REG'
      AND s.home_score IS NOT NULL
      AND s.away_score IS NOT NULL
  ) t
)
SELECT
  (SELECT COUNT(*) FROM panel_keys) AS panel_rows,
  (SELECT COUNT(*) FROM flags_keys) AS travel_flag_rows,
  (SELECT COUNT(*) FROM missing_in_flags) AS n_missing_travel_flag_rows,
  (SELECT n_missing_team_tz FROM missing_tz) AS n_missing_team_tz,
  (SELECT n_missing_home_tz FROM missing_tz) AS n_missing_home_tz
""").df()