We initialize Python imports and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd

from pathlib import Path

# Look in the working directory first and then in each ancestor for the first
# directory that contains a "db" entry; that directory becomes the project root.
cwd = Path.cwd()

for candidate in (cwd, *cwd.parents):
    if (candidate / "db").exists():
        root = candidate
        break
else:
    # The loop finished without a match, so there is no database to open.
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# Open the connection, then cap the worker threads and memory it may use.
con = duckdb.connect(str(DB_PATH))
for pragma in ("PRAGMA threads=4", "PRAGMA memory_limit='4GB'"):
    con.execute(pragma)

We confirm that 'team_week_panel' exists, set 'TEAM_COL' and 'TEAM_ABBR_COL', and define the helper functions used to safely rewrite the panel

In [None]:
# Names of every table that SHOW TABLES reports on this connection.
tables_df = con.execute("SHOW TABLES").df()
tables = set(tables_df["name"].tolist())

if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 06 first")

# Panel column metadata, kept as the raw frame, an ordered list, and a set
# for fast membership tests.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# Prefer "team_id" as the team key when the panel has it, otherwise "team".
if "team_id" in panel_cols_set:
    TEAM_COL = "team_id"
else:
    TEAM_COL = "team"

# Use "team" for the abbreviation column when present; otherwise reuse TEAM_COL.
if "team" in panel_cols_set:
    TEAM_ABBR_COL = "team"
else:
    TEAM_ABBR_COL = TEAM_COL

for label, chosen in (("Using TEAM_COL", TEAM_COL), ("Using TEAM_ABBR_COL", TEAM_ABBR_COL)):
    print(label, chosen)

def _existing_cols(table_name):
    """Return the set of column names that PRAGMA table_info reports for `table_name`.

    The query runs on the module-level `con` connection, and `table_name` is
    interpolated directly into the SQL text.
    """
    info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    names = info["name"].tolist()
    return set(names)

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build a `<alias>.*` select expression that drops selected columns.

    Only the entries of `cols_to_maybe_exclude` that actually exist in
    `table_name` are placed in the EXCLUDE list, in the order given. When
    none of them exist, a plain `<alias>.*` is returned.
    """
    present = _existing_cols(table_name)
    to_drop = [name for name in cols_to_maybe_exclude if name in present]
    if not to_drop:
        return f"{alias}.*"
    excluded = ", ".join(to_drop)
    return f"{alias}.* EXCLUDE ({excluded})"

We validate that the required workload and volatility columns are in the table and also ensure we have a complete set of inputs before attempting to compress the data using PCA

In [None]:
# Columns that must already be present in team_week_panel, grouped by role:
# row identifiers first, then the snap, special-teams and short-week inputs.
required_cols = [
    "season", "week", "game_id", TEAM_ABBR_COL,
    "offensive_snaps", "defensive_snaps",
    "ST_Load_NonScore_w", "short_week_flag",
]

# Collect the absent columns in declaration order so the error lists them all at once.
missing = [name for name in required_cols if name not in panel_cols_set]
if missing:
    raise RuntimeError(f"Missing required columns in team_week_panel, {missing}")

We build a team to timezone mapping table that supports historical team abbreviations for travel and timezone shift features

In [None]:
# Create (or replace) the temporary table `team_timezone_map` from an inline
# VALUES list. It has two columns: `team` (a team abbreviation) and
# `tz_utc_offset` (an integer).
#
# The first group of rows lists 32 abbreviations. The three rows after the
# blank line (OAK, SD, STL) are extra abbreviations, presumably the historical
# codes for relocated franchises; confirm against the abbreviations that
# actually appear in team_week_panel.
#
# NOTE(review): the offsets look like standard-time UTC offsets in whole hours
# and carry no daylight-saving adjustment; confirm this is acceptable for the
# downstream timezone-shift features.
con.execute("""
CREATE OR REPLACE TEMP TABLE team_timezone_map AS
SELECT * FROM (VALUES
  ('ARI', -7),
  ('ATL', -5),
  ('BAL', -5),
  ('BUF', -5),
  ('CAR', -5),
  ('CHI', -6),
  ('CIN', -5),
  ('CLE', -5),
  ('DAL', -6),
  ('DEN', -7),
  ('DET', -5),
  ('GB', -6),
  ('HOU', -6),
  ('IND', -5),
  ('JAX', -5),
  ('KC', -6),
  ('LA', -8),
  ('LAC', -8),
  ('LV', -8),
  ('MIA', -5),
  ('MIN', -6),
  ('NE', -5),
  ('NO', -6),
  ('NYG', -5),
  ('NYJ', -5),
  ('PHI', -5),
  ('PIT', -5),
  ('SEA', -8),
  ('SF', -8),
  ('TB', -5),
  ('TEN', -6),
  ('WAS', -5),

  ('OAK', -8),
  ('SD', -8),
  ('STL', -6)
) AS t(team, tz_utc_offset)
""")