We initialize the Python imports and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd

from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "db" folder.
cwd = Path.cwd()

# is_dir() rather than exists(): a plain file that happens to be named "db"
# must not be mistaken for the database folder.
root = next((d for d in (cwd, *cwd.parents) if (d / "db").is_dir()), None)

if root is None:
    raise FileNotFoundError("Could not find a 'db' folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# duckdb.connect() silently creates a new, empty database when the file is
# absent; fail fast here so a wrong path is not mistaken for missing tables.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"DuckDB database file not found: {DB_PATH}")

# Single shared connection reused by the cells below.
con = duckdb.connect(str(DB_PATH))

# Cap DuckDB's worker threads and memory for this session.
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='4GB'")

Quick sanity check to confirm that the required tables (`team_week_panel` and `schedules`) exist in the database and that the panel contains the special teams workload metrics from the previous step, so the environment is ready for statistical analysis

In [None]:
# Tables the cells below read from; both must already exist in the database.
required_tables = ["team_week_panel", "schedules"]
existing = set(con.execute("SHOW TABLES").df()["name"].tolist())
missing_tables = [t for t in required_tables if t not in existing]

print("Missing tables", missing_tables)
print("OK" if not missing_tables else "STOP, fix missing tables before continuing")

# Enforce the check instead of only printing it: without this, a missing
# table surfaces as an opaque catalog error from DESCRIBE or a later query.
if missing_tables:
    raise RuntimeError(f"Required tables missing from database: {missing_tables}")

# Column names of the panel, used here and by the team-column detection cell.
panel_cols_df = con.execute("DESCRIBE team_week_panel").df()
panel_cols = set(panel_cols_df["column_name"].tolist())

# Keys plus the three weekly special-teams load columns the later SQL references.
required_cols = [
    "season",
    "week",
    "game_id",
    "ST_Load_All_w",
    "ST_Load_ScoreLinked_w",
    "ST_Load_NonScore_w",
]

missing_cols = [c for c in required_cols if c not in panel_cols]

print("Missing required columns in team_week_panel", missing_cols)
print("OK" if not missing_cols else "STOP, Step 4 outputs missing from panel")

# Same reasoning as above: stop here with a clear message rather than letting
# a later query fail on an unknown column.
if missing_cols:
    raise RuntimeError(f"Required columns missing from team_week_panel: {missing_cols}")

We detect the primary team join key by inspecting the panel schema, allowing the downstream processing logic to remain agnostic to naming conventions between different data versions

In [None]:
# Pick the team identifier column of team_week_panel: "team_id" is preferred,
# "team" is the fallback. Anything else is treated as an unusable schema.
_team_col_matches = [name for name in ("team_id", "team") if name in panel_cols]

if not _team_col_matches:
    raise ValueError("Could not find a team column in team_week_panel")

# First match wins, which preserves the team_id-before-team priority.
PANEL_TEAM_COL = _team_col_matches[0]

print("Using panel team column", PANEL_TEAM_COL)

We create a base view that identifies regular season game weeks to ensure that season-level team statistics are not skewed by bye weeks or post-season data

In [None]:
# Base view: every team_week_panel row plus the schedule's game_type and a
# 0/1 flag is_reg_game_week. The flag is 1 only when the row has a game_id
# whose schedules entry has game_type 'REG'; rows with no game_id or no
# schedule match fall through to 0.
# Plain (non-f) string: the SQL has no placeholders, so nothing is interpolated
# and literal braces added later cannot be misread as format fields.
con.execute("""
CREATE OR REPLACE TEMP VIEW panel_step5_base AS
SELECT
  p.*,
  s.game_type AS sched_game_type,
  CASE WHEN p.game_id IS NOT NULL AND s.game_type = 'REG' THEN 1 ELSE 0 END AS is_reg_game_week
FROM team_week_panel p
LEFT JOIN schedules s
  ON p.game_id = s.game_id
""")

We compute the mean and standard deviation for every workload category at the team-season level, using only active regular season weeks, and persist the results in a reference table for future normalization

In [None]:
# Per (season, team): mean and sample standard deviation of each weekly
# special-teams load column, computed over rows with is_reg_game_week = 1,
# plus the count of those rows. Written as a regular (non-temp) table.
# The detected team column is exposed under the fixed name "team".
_team_season_stats_sql = f"""
CREATE OR REPLACE TABLE team_season_st_stats AS
SELECT
  season,
  {PANEL_TEAM_COL} AS team,

  AVG(ST_Load_All_w) AS mean_ST_All,
  STDDEV_SAMP(ST_Load_All_w) AS sd_ST_All,

  AVG(ST_Load_ScoreLinked_w) AS mean_ST_ScoreLinked,
  STDDEV_SAMP(ST_Load_ScoreLinked_w) AS sd_ST_ScoreLinked,

  AVG(ST_Load_NonScore_w) AS mean_ST_NonScore,
  STDDEV_SAMP(ST_Load_NonScore_w) AS sd_ST_NonScore,

  COUNT(*) AS n_game_weeks
FROM panel_step5_base
WHERE is_reg_game_week = 1
GROUP BY season, {PANEL_TEAM_COL}
"""

con.execute(_team_season_stats_sql)

Quick sanity check to confirm that each team-season has a unique entry in the summary table, ensuring the game counts align with a typical season length and that the standard deviation is successfully calculated

In [None]:
# One-row summary of team_season_st_stats: total rows vs. distinct
# season-team keys (equal when each team-season appears once), the range of
# n_game_weeks, and how many rows have a zero or NULL standard deviation for
# the "All" and "NonScore" loads.
_stats_summary_sql = """
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || team) AS distinct_team_seasons,
  MIN(n_game_weeks) AS min_game_weeks,
  MAX(n_game_weeks) AS max_game_weeks,
  SUM(CASE WHEN sd_ST_All IS NULL OR sd_ST_All = 0 THEN 1 ELSE 0 END) AS zero_or_null_sd_all,
  SUM(CASE WHEN sd_ST_NonScore IS NULL OR sd_ST_NonScore = 0 THEN 1 ELSE 0 END) AS zero_or_null_sd_nonscore
FROM team_season_st_stats
"""

# Kept as the cell's final expression so the DataFrame is displayed.
con.execute(_stats_summary_sql).df()

Quick sanity check that lists the team-seasons where the standard deviation is zero or missing, so we know which cases must be guarded against when dividing to calculate z-scores

In [None]:
# List the team-seasons whose "All" or "NonScore" standard deviation is NULL
# or zero, i.e. the rows for which a z-score division would be undefined.
_degenerate_sd_sql = """
SELECT
  season,
  team,
  n_game_weeks,
  mean_ST_All,
  sd_ST_All,
  mean_ST_NonScore,
  sd_ST_NonScore
FROM team_season_st_stats
WHERE sd_ST_All IS NULL OR sd_ST_All = 0 OR sd_ST_NonScore IS NULL OR sd_ST_NonScore = 0
ORDER BY season, team
"""

# Kept as the cell's final expression so the DataFrame is displayed.
con.execute(_degenerate_sd_sql).df()

We join the team season statistics back to the individual game-week rows and calculate the z-scores, defaulting to zero if the standard deviation is zero or if the week isn't a regular season game to keep the data clean

In [None]:
# View panel_step5_z: every panel_step5_base row joined (LEFT JOIN on season
# and the detected team column) to its team-season statistics, plus:
#   - has_st_data_team_season: 1 when the team-season mean of the "All" load
#     is present and positive, else 0;
#   - Z_ST_All_w / Z_ST_ScoreLinked_w / Z_ST_NonScore_w: (value - mean) / sd
#     for regular-season game weeks whose team-season has a positive "All"
#     mean and a positive sd for that metric; 0 in every other case.
_panel_z_view_sql = f"""
CREATE OR REPLACE TEMP VIEW panel_step5_z AS
SELECT
  p.*,

  ts.mean_ST_All,
  ts.sd_ST_All,
  ts.mean_ST_ScoreLinked,
  ts.sd_ST_ScoreLinked,
  ts.mean_ST_NonScore,
  ts.sd_ST_NonScore,
  ts.n_game_weeks,

  CASE
    WHEN ts.mean_ST_All IS NOT NULL AND ts.mean_ST_All > 0 THEN 1 ELSE 0
  END AS has_st_data_team_season,

  CASE
    WHEN p.is_reg_game_week = 1
     AND ts.mean_ST_All IS NOT NULL AND ts.mean_ST_All > 0
     AND ts.sd_ST_All IS NOT NULL AND ts.sd_ST_All > 0
      THEN (p.ST_Load_All_w - ts.mean_ST_All) / ts.sd_ST_All
    ELSE 0
  END AS Z_ST_All_w,

  CASE
    WHEN p.is_reg_game_week = 1
     AND ts.mean_ST_All IS NOT NULL AND ts.mean_ST_All > 0
     AND ts.sd_ST_ScoreLinked IS NOT NULL AND ts.sd_ST_ScoreLinked > 0
      THEN (p.ST_Load_ScoreLinked_w - ts.mean_ST_ScoreLinked) / ts.sd_ST_ScoreLinked
    ELSE 0
  END AS Z_ST_ScoreLinked_w,

  CASE
    WHEN p.is_reg_game_week = 1
     AND ts.mean_ST_All IS NOT NULL AND ts.mean_ST_All > 0
     AND ts.sd_ST_NonScore IS NOT NULL AND ts.sd_ST_NonScore > 0
      THEN (p.ST_Load_NonScore_w - ts.mean_ST_NonScore) / ts.sd_ST_NonScore
    ELSE 0
  END AS Z_ST_NonScore_w

FROM panel_step5_base p
LEFT JOIN team_season_st_stats ts
  ON p.season = ts.season
 AND p.{PANEL_TEAM_COL} = ts.team
"""

con.execute(_panel_z_view_sql)

Quick sanity check to confirm that every regular season game week has matching team-season statistics and a non-null z-score

In [None]:
# Count regular-season game weeks in panel_step5_z that have no joined
# team-season mean, or whose "All" / "NonScore" z-score is NULL.
_missing_z_sql = """
SELECT
  SUM(CASE WHEN is_reg_game_week = 1 AND mean_ST_All IS NULL THEN 1 ELSE 0 END) AS reg_weeks_missing_stats,
  SUM(CASE WHEN is_reg_game_week = 1 AND Z_ST_All_w IS NULL THEN 1 ELSE 0 END) AS reg_weeks_missing_z_all,
  SUM(CASE WHEN is_reg_game_week = 1 AND Z_ST_NonScore_w IS NULL THEN 1 ELSE 0 END) AS reg_weeks_missing_z_nonscore
FROM panel_step5_z
"""

# Kept as the cell's final expression so the DataFrame is displayed.
con.execute(_missing_z_sql).df()

Quick sanity check to confirm that the z-scores average out to approximately zero for each team's season, verifying the normalization was successful; the 25 team-seasons with the largest absolute average non-score z-score are shown

In [None]:
# Average of each z-score per (season, team) over regular-season game weeks,
# with the row count, showing the 25 groups whose average NonScore z-score is
# furthest from zero.
_avg_z_sql = f"""
SELECT
  season,
  {PANEL_TEAM_COL} AS team,
  AVG(Z_ST_All_w) AS avg_z_all,
  AVG(Z_ST_ScoreLinked_w) AS avg_z_scorelinked,
  AVG(Z_ST_NonScore_w) AS avg_z_nonscore,
  COUNT(*) AS reg_rows
FROM panel_step5_z
WHERE is_reg_game_week = 1
GROUP BY season, {PANEL_TEAM_COL}
ORDER BY ABS(AVG(Z_ST_NonScore_w)) DESC
LIMIT 25
"""

# Kept as the cell's final expression so the DataFrame is displayed.
con.execute(_avg_z_sql).df()