We initialize Python imports and open a DuckDB connection that every later cell reuses

In [1]:
import duckdb
import pandas as pd

from pathlib import Path

import numpy as np

# Search the working directory and then each of its parents for a folder
# that contains a "db" entry; the first match becomes the project root.
cwd = Path.cwd()
root = next(
    (candidate for candidate in [cwd, *cwd.parents] if (candidate / "db").exists()),
    None,
)
if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# Single shared connection; the later cells all go through `con`.
con = duckdb.connect(str(DB_PATH))
for pragma in ("PRAGMA threads=4", "PRAGMA memory_limit='4GB'"):
    con.execute(pragma)

# Quick sanity check that the panel table is readable.
panel_row_count = con.execute("SELECT COUNT(*) AS rows FROM team_week_panel").df()
print(panel_row_count)

Using DB_PATH /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
   rows
0  6782


We confirm that the 'team_week_panel' table, which houses our rows keyed by season, week, and team ID, is physically present in the workspace, set the team and abbreviation columns to match the configuration used for gathering raw data and building the panel, and also define the necessary utility functions for the PCA preparation

In [2]:
# Fail fast when the upstream notebooks have not produced the panel yet.
tables_df = con.execute("SHOW TABLES").df()
tables = set(tables_df["name"].tolist())
if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 07 first")

# Panel column metadata, kept as a frame, an ordered list, and a set.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# The team key is "team_id" when the panel has it, otherwise "team".
if "team_id" in panel_cols_set:
    TEAM_COL = "team_id"
else:
    TEAM_COL = "team"

# The abbreviation column is "team" when present, otherwise it falls back
# to whatever TEAM_COL resolved to.
if "team" in panel_cols_set:
    TEAM_ABBR_COL = "team"
else:
    TEAM_ABBR_COL = TEAM_COL

print("Using TEAM_COL", TEAM_COL)
print("Using TEAM_ABBR_COL", TEAM_ABBR_COL)

def _existing_cols(table_name):
    """Return the set of column names reported by PRAGMA table_info for *table_name*.

    The lookup runs on the module-level ``con`` connection. ``table_name`` is
    interpolated directly into the SQL text.
    """
    table_info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    return set(table_info["name"].tolist())

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build an ``alias.*`` select fragment that omits selected columns.

    Only the entries of ``cols_to_maybe_exclude`` that currently exist on
    ``table_name`` are placed in the ``EXCLUDE (...)`` clause, in the order
    given. When none of them exist, the plain ``alias.*`` is returned.
    """
    present = _existing_cols(table_name)
    droppable = [name for name in cols_to_maybe_exclude if name in present]
    if not droppable:
        return f"{alias}.*"
    excluded = ", ".join(droppable)
    return f"{alias}.* EXCLUDE ({excluded})"

# Diagnostic echo: whether the panel has a "team" column, then whether it has "team_id".
print("team" in panel_cols_set, "team_id" in panel_cols_set)

Using TEAM_COL team
Using TEAM_ABBR_COL team
True False


We define the exact set of cumulative and per-game workload columns that will feed into the PCA, and also perform a strict validation check against the 'team_week_panel' to ensure all previously persisted data is available.

In [3]:
# Columns that feed the PCA; each one must already exist on team_week_panel.
required_inputs = [
    "cum_off_snaps_w",
    "cum_def_snaps_w",
    "cum_ST_Load_w",
    "cum_short_weeks_w",
    "cum_long_travel_w",
    "cum_timezone_changes_w",
    "cum_west_to_east_w",
    "cum_total_snaps_w",
    "cum_rest_deficit_days_w",
    "cum_away_games_w",
    "cum_byes_w",
]

cols_now = _existing_cols("team_week_panel")

# Preserve the order of required_inputs in the report of what is absent.
missing_required = list(filter(lambda name: name not in cols_now, required_inputs))
print("Missing required inputs", missing_required)

if missing_required:
    missing_text = ", ".join(missing_required)
    raise RuntimeError(
        f"Step 8 cannot run because required cumulative columns are missing, missing are {missing_text}"
    )

# pca_inputs is the same list object as required_inputs (an alias, not a copy).
pca_inputs = required_inputs

print("Final PCA input columns used", pca_inputs)

Missing required inputs []
Final PCA input columns used ['cum_off_snaps_w', 'cum_def_snaps_w', 'cum_ST_Load_w', 'cum_short_weeks_w', 'cum_long_travel_w', 'cum_timezone_changes_w', 'cum_west_to_east_w', 'cum_total_snaps_w', 'cum_rest_deficit_days_w', 'cum_away_games_w', 'cum_byes_w']


We pull the PCA inputs into pandas, verify that each team-week has a unique index to prevent data leakage, handle any missing values through imputation or zero-filling, and then standardize each variable across the entire dataset to prepare for the Principal Component Analysis

In [4]:
# Pull the key columns plus every PCA input from the panel into pandas.
input_select = ", ".join(pca_inputs)
extract_sql = f"""
    SELECT
      season,
      week,
      {TEAM_ABBR_COL} AS team_key,
      {input_select}
    FROM team_week_panel
    """
df = con.execute(extract_sql).df()

# Each (season, week, team_key) must appear exactly once.
key_cols = ["season", "week", "team_key"]
dup_mask = df.duplicated(subset=key_cols)
if dup_mask.any():
    n_dup = int(dup_mask.sum())
    raise RuntimeError(f"Duplicate keys found in extracted panel, duplicates {n_dup}")

# Force numeric dtypes; anything unparseable becomes NaN.
for col in pca_inputs:
    df[col] = pd.to_numeric(df[col], errors="coerce")

missing_rates = df[pca_inputs].isna().mean()
missing_rates = missing_rates.sort_values(ascending=False)

print("Missing rate per input")
print(missing_rates)

# Missing values are zero-filled before the scaling statistics are computed.
df[pca_inputs] = df[pca_inputs].fillna(0.0)

# Population (ddof=0) statistics over all rows.
means = df[pca_inputs].mean(axis=0)
sds = df[pca_inputs].std(axis=0, ddof=0)

# Columns with (near-)zero spread cannot be standardized and are dropped.
zero_sd = [col for col in pca_inputs if np.isclose(float(sds[col]), 0.0)]
use_inputs = [col for col in pca_inputs if col not in zero_sd]

print("Zero standard deviation inputs dropped", zero_sd)
print("Inputs used after drop", use_inputs)

if len(use_inputs) < 3:
    raise RuntimeError("Too few usable inputs for PCA after dropping zero variance columns")

# z-score each remaining input; any leftover inf/NaN is neutralised to 0.
Z = df[use_inputs].sub(means[use_inputs]).div(sds[use_inputs])
Z = Z.replace([np.inf, -np.inf], 0.0).fillna(0.0)

print("Z shape", Z.shape)
print(Z.mean().sort_values())
print(Z.std(ddof=0).sort_values())

Missing rate per input
cum_off_snaps_w            0.0
cum_def_snaps_w            0.0
cum_ST_Load_w              0.0
cum_short_weeks_w          0.0
cum_long_travel_w          0.0
cum_timezone_changes_w     0.0
cum_west_to_east_w         0.0
cum_total_snaps_w          0.0
cum_rest_deficit_days_w    0.0
cum_away_games_w           0.0
cum_byes_w                 0.0
dtype: float64
Zero standard deviation inputs dropped []
Inputs used after drop ['cum_off_snaps_w', 'cum_def_snaps_w', 'cum_ST_Load_w', 'cum_short_weeks_w', 'cum_long_travel_w', 'cum_timezone_changes_w', 'cum_west_to_east_w', 'cum_total_snaps_w', 'cum_rest_deficit_days_w', 'cum_away_games_w', 'cum_byes_w']
Z shape (6782, 11)
cum_total_snaps_w         -1.246750e-16
cum_ST_Load_w             -8.407705e-17
cum_short_weeks_w         -4.295525e-17
cum_long_travel_w         -3.771681e-17
cum_off_snaps_w           -3.562143e-17
cum_west_to_east_w        -2.933529e-17
cum_rest_deficit_days_w    1.676303e-17
cum_byes_w                 4.

We run PCA on the standardized cumulative and per-game rate inputs using a single component, then extract the PC1 scores to create our 'Cumulative_Workload_Index_w' while reporting the explained variance ratio to confirm how much information is retained

In [5]:
from sklearn.decomposition import PCA

# Fit a one-component PCA on the standardized matrix and flatten the scores
# from shape (n_rows, 1) to a 1-D array.
pca = PCA(n_components=1, svd_solver="full")
pc1_scores = pca.fit_transform(Z.values).ravel()

explained_var_ratio_pc1 = float(pca.explained_variance_ratio_[0])

# One loading per input column, labelled by column name.
loadings = pd.Series(pca.components_[0], index=use_inputs, name="loading_pc1")

print("Explained variance ratio PC1", explained_var_ratio_pc1)
print("Top loadings by absolute value")

# Rank by magnitude but print the signed loadings, capped at 15 entries.
top_by_magnitude = loadings.abs().sort_values(ascending=False).head(15).index
print(loadings.reindex(top_by_magnitude))

Explained variance ratio PC1 0.6374030207696648
Top loadings by absolute value
cum_total_snaps_w          0.359774
cum_def_snaps_w            0.358970
cum_away_games_w           0.358475
cum_off_snaps_w            0.358460
cum_ST_Load_w              0.354028
cum_byes_w                 0.290195
cum_timezone_changes_w     0.285650
cum_rest_deficit_days_w    0.276442
cum_short_weeks_w          0.273364
cum_long_travel_w          0.181837
cum_west_to_east_w         0.096682
Name: loading_pc1, dtype: float64


We orient the first principal component so that higher values consistently represent higher cumulative workload, and also flip the sign if PC1 is negatively correlated with total snaps to ensure the index is logically interpretable

In [6]:
# Cumulative total snaps is the reference series used to fix the sign of PC1.
anchor = df["cum_total_snaps_w"].values.astype(float)
corr_before = float(np.corrcoef(pc1_scores, anchor)[0, 1])

# Flip only when the correlation is a finite negative number.
needs_flip = np.isfinite(corr_before) and corr_before < 0
flip = -1.0 if needs_flip else 1.0

# Scores and loadings are flipped together so they stay consistent.
pc1_scores_oriented = flip * pc1_scores
loadings_oriented = flip * loadings

corr_after = float(np.corrcoef(pc1_scores_oriented, anchor)[0, 1])

print("Correlation with cum_total_snaps_w before flip", corr_before)
print("Flip applied", flip)
print("Correlation with cum_total_snaps_w after flip", corr_after)

# Key columns plus the oriented score, as an independent frame.
df_index = df[["season", "week", "team_key"]].assign(
    Cumulative_Workload_Index_w=pc1_scores_oriented.astype(float)
)

Correlation with cum_total_snaps_w before flip 0.9526503824732212
Flip applied 1.0
Correlation with cum_total_snaps_w after flip 0.9526503824732212


We archive the transformation weights and scaling factors to ensure the research is reproducible, while documenting the explained variance and orientation of the first principal component for long-term auditability within the project

In [7]:
import datetime as dt

# Take the timestamp once so the version tag and created_utc describe the
# same instant.
now_utc = dt.datetime.now(dt.UTC)
version = now_utc.strftime("%Y%m%d_%H%M%S_utc")
created_utc = now_utc.isoformat()

# Per-input scaling parameters and oriented PC1 loadings for this run.
artifacts = pd.DataFrame({
    "version": version,
    "column_name": use_inputs,
    "mean": means[use_inputs].values.astype(float),
    "sd": sds[use_inputs].values.astype(float),
    "loading_pc1": loadings_oriented.loc[use_inputs].values.astype(float),
})

# One-row run summary.
summary = pd.DataFrame([{
    "version": version,
    "n_rows": int(len(df)),
    "n_inputs_requested": int(len(pca_inputs)),
    "n_inputs_used": int(len(use_inputs)),
    "explained_variance_ratio_pc1": float(explained_var_ratio_pc1),
    "pc1_flip": float(flip),
    "created_utc": created_utc,
}])

# Expose both frames to DuckDB under fixed view names.
con.register("pca_artifacts_df", artifacts)
con.register("pca_summary_df", summary)

con.execute("""
CREATE TABLE IF NOT EXISTS pca_cum_workload_artifacts (
  version VARCHAR,
  column_name VARCHAR,
  mean DOUBLE,
  sd DOUBLE,
  loading_pc1 DOUBLE
)
""")

con.execute("""
CREATE TABLE IF NOT EXISTS pca_cum_workload_summary (
  version VARCHAR,
  n_rows BIGINT,
  n_inputs_requested BIGINT,
  n_inputs_used BIGINT,
  explained_variance_ratio_pc1 DOUBLE,
  pc1_flip DOUBLE,
  created_utc VARCHAR
)
""")

# Name the columns on both sides of each INSERT. The tables are created with
# IF NOT EXISTS, so a table left over from an earlier run may not share the
# DataFrame's column order; a positional "SELECT *" would then write values
# into the wrong columns without any error.
artifact_cols = ["version", "column_name", "mean", "sd", "loading_pc1"]
summary_cols = [
    "version",
    "n_rows",
    "n_inputs_requested",
    "n_inputs_used",
    "explained_variance_ratio_pc1",
    "pc1_flip",
    "created_utc",
]

# Identifiers are double-quoted so that names such as "mean" are always read
# as column references.
artifact_col_sql = ", ".join(f'"{name}"' for name in artifact_cols)
summary_col_sql = ", ".join(f'"{name}"' for name in summary_cols)

# Rows are appended, so every execution of this cell adds a new version.
con.execute(
    f"INSERT INTO pca_cum_workload_artifacts ({artifact_col_sql}) "
    f"SELECT {artifact_col_sql} FROM pca_artifacts_df"
)
con.execute(
    f"INSERT INTO pca_cum_workload_summary ({summary_col_sql}) "
    f"SELECT {summary_col_sql} FROM pca_summary_df"
)

print(con.execute("SELECT * FROM pca_cum_workload_summary ORDER BY created_utc DESC LIMIT 3").df())

               version  n_rows  n_inputs_requested  n_inputs_used  \
0  20260103_212306_utc    6782                  11             11   
1  20260103_210903_utc    6782                  11             11   
2  20260103_210809_utc    6782                  11             11   

   explained_variance_ratio_pc1  pc1_flip                       created_utc  
0                      0.637403       1.0  2026-01-03T21:23:06.079576+00:00  
1                      0.637403       1.0        2026-01-03T21:09:03.020411  
2                      0.637403       1.0  2026-01-03T21:08:09.869236+00:00  


We join our newly derived PCA scores onto the master panel using the team and week identifiers, followed by a density check to confirm that the integration is 100% complete across the entire longitudinal series

In [8]:
# Rename the key back to the panel's own team column so the join can use it.
df_index_for_db = df_index.rename(columns={"team_key": TEAM_ABBR_COL})
con.register("pca_index_df", df_index_for_db)

# Materialise the scores as a real table. This stays outside the transaction
# below so the table is still available for inspection if validation fails.
con.execute("DROP TABLE IF EXISTS pca_cum_workload_index_tmp")
con.execute("""
CREATE TABLE pca_cum_workload_index_tmp AS
SELECT * FROM pca_index_df
""")

pre_rows = int(con.execute("SELECT COUNT(*) AS n FROM team_week_panel").df()["n"].iloc[0])

# Drop any index column left by a previous run so the cell can be re-executed.
star = _star_excluding("team_week_panel", "p", ["Cumulative_Workload_Index_w"])

# Replace and validate the panel inside one transaction. Previously the table
# was overwritten before the checks ran, so a failed check left a corrupted
# team_week_panel behind. Now any failure rolls the panel back.
con.execute("BEGIN TRANSACTION")
try:
    con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
SELECT
  {star},
  i.Cumulative_Workload_Index_w
FROM team_week_panel p
LEFT JOIN pca_cum_workload_index_tmp i
USING (season, week, {TEAM_ABBR_COL})
""")

    post_rows = int(con.execute("SELECT COUNT(*) AS n FROM team_week_panel").df()["n"].iloc[0])

    nulls = con.execute("""
SELECT
  SUM(CASE WHEN Cumulative_Workload_Index_w IS NULL THEN 1 ELSE 0 END) AS n_null
FROM team_week_panel
""").df()["n_null"].iloc[0]

    print("Rows before", pre_rows)
    print("Rows after", post_rows)
    print("Null index rows", nulls)

    # A LEFT JOIN can only change the row count if keys are duplicated on the
    # index side.
    if pre_rows != post_rows:
        raise RuntimeError("Row count changed after adding the index, investigate key duplication or join mismatch")

    # A null index means some panel key had no matching score row.
    if nulls != 0:
        raise RuntimeError("Index has nulls after join, investigate missing keys in pca_index_df")
except Exception:
    # Undo the replacement so the panel is exactly as it was before this cell.
    con.execute("ROLLBACK")
    raise
else:
    con.execute("COMMIT")

Rows before 6782
Rows after 6782
Null index rows 0.0
