We import the required Python modules and open a DuckDB connection that every later cell reuses

In [None]:
import os
from pathlib import Path
import duckdb

# Report the working directory so the relative search roots below are easy to interpret.
print("cwd", Path().resolve())

# Directories probed for DuckDB files, relative to the working directory.
search_roots = [
    Path(rel)
    for rel in (".", "..", "./data", "../data", "../../data")
]

# For every root that exists, gather direct matches first and recursive matches second.
candidates = [
    hit
    for root in search_roots
    if root.exists()
    for pattern in ("*.duckdb", "**/*.duckdb")
    for hit in root.glob(pattern)
]

# Collapse duplicates by resolved path while keeping first-seen order.
seen = set()
duckdb_files = []
for resolved in (hit.resolve() for hit in candidates):
    key = str(resolved)
    if key in seen:
        continue
    seen.add(key)
    duckdb_files.append(resolved)

print("duckdb files found")
for position, path in enumerate(duckdb_files[:25]):
    print(position, path)

# Prefer the file named nflpa.duckdb; otherwise fall back to the first file found.
db_file = next((path for path in duckdb_files if path.name == "nflpa.duckdb"), None)
if db_file is None:
    if not duckdb_files:
        raise RuntimeError("No duckdb file found near this notebook, rerun notebook 02 or check where you stored the database file")
    db_file = duckdb_files[0]

# Shared connection reused by the cells that follow.
con = duckdb.connect(str(db_file))
print("connected db", db_file)

# Tables in the connected database; kept as the last expression so the notebook renders it.
tables = con.execute("SHOW TABLES").df()
tables

We define small helpers for column discovery and strict required column checks so later sanity checks fail early if an upstream notebook was skipped or a column name drifted

In [None]:
import pandas as pd

# Names of the season, week and team columns, and of the table the checks below read from.
# They are interpolated into the SQL text of later cells.
SEASON_COL = "season"
WEEK_COL = "week"
TEAM_COL = "team"
PANEL_TABLE = "team_week_panel"

def _existing_cols(table_name: str) -> list[str]:
    """Return the column names that DESCRIBE reports for ``table_name``."""
    described = con.execute(f"DESCRIBE {table_name}").df()
    return described["column_name"].tolist()

def _require_cols(table_name: str, required: list[str]) -> None:
    """Raise RuntimeError when any name in ``required`` is absent from ``table_name``.

    The list of missing names is printed on every call, even when it is empty.
    """
    present = set(_existing_cols(table_name))
    missing = [name for name in required if name not in present]
    print("Missing required columns", missing)
    if not missing:
        return
    raise RuntimeError(f"Missing columns in {table_name}, rerun earlier notebooks, missing, {missing}")

def _cols_matching(prefix: str = "", contains: str = "", suffix: str = "") -> list[str]:
    """Return panel-table columns that satisfy every non-empty filter.

    A filter left empty is ignored; the result keeps the order in which the
    columns are listed for PANEL_TABLE.
    """
    return [
        name
        for name in _existing_cols(PANEL_TABLE)
        if (not prefix or name.startswith(prefix))
        and (not contains or contains in name)
        and (not suffix or name.endswith(suffix))
    ]

# Report how many columns DESCRIBE lists for the panel table.
print("team week panel columns", len(_existing_cols(PANEL_TABLE)))

Quick sanity check to confirm that the wide-format structure correctly reflects the expected mix of injury, workload, and control features

In [None]:
# First 30 columns in the order DESCRIBE returns them. Printed explicitly because
# a notebook only renders the final expression of a cell, so a bare expression
# here would be computed and then silently discarded.
print(con.execute("DESCRIBE team_week_panel").df().head(30))

# First 30 columns sorted by name; kept as the last expression so it is rendered.
con.execute("""
SELECT
  column_name,
  column_type
FROM (DESCRIBE team_week_panel)
ORDER BY column_name
LIMIT 30
""").df()

Quick sanity check to confirm that both schema views contain the same columns and types

In [None]:
# Columns on which the two schema listings are compared.
schema_cols = ["column_name", "column_type"]

# Listing A: DESCRIBE output as returned. Listing B: the same output sorted by name.
df_a = con.execute("DESCRIBE team_week_panel").df()
df_b = con.execute("""
SELECT column_name, column_type
FROM (DESCRIBE team_week_panel)
ORDER BY column_name
""").df()

print("rows describe", len(df_a))
print("rows sorted", len(df_b))

# Column names that appear more than once in listing A.
name_counts = df_a["column_name"].value_counts()
dupe_names = name_counts[name_counts > 1]
print("duplicate column names", dupe_names.to_dict())

# Outer-merge the de-duplicated (name, type) pairs; any pair not found on both
# sides is a mismatch between the two listings.
pairs_a = df_a[schema_cols].drop_duplicates()
pairs_b = df_b[schema_cols].drop_duplicates()
merged_pairs = pairs_a.merge(pairs_b, on=schema_cols, how="outer", indicator=True)
mismatch = merged_pairs[merged_pairs["_merge"] != "both"]
mismatch.head(50)

We verify the continuity of the dataset by aggregating the unique time periods present to ensure that the panel spans the full historical range required for the analysis

In [None]:
# Count how many entries in SHOW TABLES carry the panel table's name. The result
# is captured and printed: a bare expression in the middle of a cell is never
# rendered, so the original check produced no visible output.
table_check = con.execute(f"""
SELECT
  SUM(CASE WHEN name = '{PANEL_TABLE}' THEN 1 ELSE 0 END) AS has_team_week_panel
FROM (SHOW TABLES)
""").df()
print(table_check)

# Fail with a clear message before DESCRIBE is attempted on a missing table.
# The SUM is NULL when SHOW TABLES returns no rows, hence the isna guard.
has_panel = table_check["has_team_week_panel"].iloc[0]
if pd.isna(has_panel) or int(has_panel) == 0:
    raise RuntimeError(f"Table {PANEL_TABLE} not found in the connected database, rerun earlier notebooks")

# Columns that the checks in later cells read; raises RuntimeError if any is absent.
_require_cols(
    PANEL_TABLE,
    [
        SEASON_COL,
        WEEK_COL,
        TEAM_COL,
        "game_id",
        "points_for",
        "points_against",
        "ST_Load_All_w",
        "ST_Load_ScoreLinked_w",
        "ST_Load_NonScore_w",
        "ST_Punt_w",
        "ST_PuntReturn_w",
        "ST_Kickoff_w",
        "ST_KickReturn_w",
        "ST_FG_w",
        "ST_XP_w",
        "ST_Rare_w",
        "Inj_Off_Next_w",
        "Inj_Def_Next_w",
    ],
)

# Row count plus the season and week ranges covered by the panel; kept as the
# last expression so the notebook renders it.
con.execute(f"""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT {SEASON_COL}) AS seasons,
  MIN({SEASON_COL}) AS min_season,
  MAX({SEASON_COL}) AS max_season,
  MIN({WEEK_COL}) AS min_week,
  MAX({WEEK_COL}) AS max_week
FROM {PANEL_TABLE}
""").df()

Quick sanity check to confirm that 'team_week_panel' exists in the connected database 

In [None]:
# Counts the SHOW TABLES entries named team_week_panel; a value of 0 means the
# table is absent from the connected database.
con.execute("""
SELECT
  SUM(CASE WHEN name = 'team_week_panel' THEN 1 ELSE 0 END) AS has_team_week_panel
FROM (SHOW TABLES)
""").df()

Quick sanity check to confirm that 'team_week_panel' has no duplicate season-week-team rows and that the key count equals the row count to ensure that the final modeling table is a perfectly unique panel

In [None]:
# Total rows versus distinct season-week-team keys. Captured and printed because
# a bare expression in the middle of a cell is never rendered by the notebook.
key_counts = con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()
print(key_counts)

# Up to 50 keys that occur more than once, most frequent first.
dups = con.execute("""
SELECT
  season,
  week,
  team,
  COUNT(*) AS n
FROM team_week_panel
GROUP BY 1,2,3
HAVING COUNT(*) > 1
ORDER BY n DESC, season, week, team
LIMIT 50
""").df()

print(dups)

if len(dups) > 0:
    raise RuntimeError("Duplicate season week team rows exist in team_week_panel, investigate joins before proceeding")

# Enforce the stated expectation that the key count equals the row count. With no
# duplicate groups, a shortfall means some key part is NULL: the concatenation is
# then NULL and COUNT(DISTINCT ...) does not count it.
n_rows = int(key_counts["rows"].iloc[0])
n_keys = int(key_counts["distinct_keys"].iloc[0])
if n_rows != n_keys:
    raise RuntimeError("Distinct season week team key count does not match row count in team_week_panel, check for null key columns")

We verify that the "next game" indicator accurately distinguishes between teams that play in the following calendar week and those heading into a bye to ensure that the 'w+1' outcome logic aligns with the physical game schedule

In [None]:
# Drop any earlier copy of the helper view so the CREATE below can run again.
con.execute("DROP VIEW IF EXISTS panel_next_week_flags")

# One output row per matching pair from the self-join: has_next_week is 1 when the
# same team has a panel row for week + 1 in the same season, otherwise 0.
# base and nxt select the same three columns from the panel table; the LEFT JOIN
# keeps every base row even when no following-week row exists.
con.execute(f"""
CREATE TEMP VIEW panel_next_week_flags AS
WITH base AS (
  SELECT
    {SEASON_COL} AS season,
    {WEEK_COL} AS week,
    CAST({TEAM_COL} AS VARCHAR) AS team_key
  FROM {PANEL_TABLE}
),
nxt AS (
  SELECT
    {SEASON_COL} AS season,
    {WEEK_COL} AS week,
    CAST({TEAM_COL} AS VARCHAR) AS team_key
  FROM {PANEL_TABLE}
)
SELECT
  b.season,
  b.week,
  b.team_key,
  CASE WHEN n.week IS NULL THEN 0 ELSE 1 END AS has_next_week
FROM base b
LEFT JOIN nxt n
  ON n.season = b.season
 AND n.team_key = b.team_key
 AND n.week = b.week + 1
""")

# Row counts per flag value; the last expression of the cell, so the notebook renders it.
con.execute("""
SELECT
  has_next_week,
  COUNT(*) AS n_rows
FROM panel_next_week_flags
GROUP BY 1
ORDER BY 1
""").df()

Quick sanity check to confirm that next week injury outcomes are only defined when week 'w+1' exists for that team season and that they are null when the next week row is missing

In [None]:
# Outcome columns expected in the panel; only those actually present are checked.
expected_outcome_cols = ["Inj_Off_Next_w", "Inj_Def_Next_w"]
cols_now = set(_existing_cols(PANEL_TABLE))
inj_next_week_cols = [name for name in expected_outcome_cols if name in cols_now]

if len(inj_next_week_cols) == 0:
    raise RuntimeError("No next week injury outcome columns found, expected Inj_Off_Next_w and Inj_Def_Next_w")

print("Next week injury columns checked", inj_next_week_cols)

# SQL fragments built once: a predicate that is true when any outcome column is
# populated, and the select list used by the sample query.
select_any_defined = " OR ".join(f"p.{name} IS NOT NULL" for name in inj_next_week_cols)
outcome_select_list = ", ".join(f"p.{name}" for name in inj_next_week_cols)

# Rows flagged as having no following week that nevertheless carry an outcome value.
bad_defined = con.execute(f"""
SELECT
  COUNT(*) AS bad_rows
FROM {PANEL_TABLE} p
JOIN panel_next_week_flags f
  ON f.season = p.{SEASON_COL}
 AND f.week = p.{WEEK_COL}
 AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
WHERE f.has_next_week = 0
  AND ({select_any_defined})
""").df()

print(bad_defined)
bad_rows = int(bad_defined["bad_rows"].iloc[0])

if bad_rows:
    # Show up to 50 offending rows before failing.
    sample = con.execute(f"""
    SELECT
      p.{SEASON_COL} AS season,
      p.{WEEK_COL} AS week,
      p.{TEAM_COL} AS team,
      f.has_next_week,
      {outcome_select_list}
    FROM {PANEL_TABLE} p
    JOIN panel_next_week_flags f
      ON f.season = p.{SEASON_COL}
     AND f.week = p.{WEEK_COL}
     AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
    WHERE f.has_next_week = 0
      AND ({select_any_defined})
    ORDER BY p.{SEASON_COL} DESC, p.{WEEK_COL} DESC, p.{TEAM_COL}
    LIMIT 50
    """).df()
    print(sample)
    raise RuntimeError("Next week injury outcomes are populated when week plus 1 does not exist, fix notebook 09 logic or apply the step 11 repair cell")

We handle the undefined next week cases by creating a modeling view that drops all rows where week 'w+1' does not exist, which are the final game weeks and the pre bye game weeks.

In [None]:
# Drop any earlier copy of the modeling view so the CREATE below can run again.
con.execute("DROP VIEW IF EXISTS team_week_panel_nextweek_model")

# Panel rows restricted to those whose team has a row for the following week.
# NOTE(review): this is a non-temporary view that reads panel_next_week_flags,
# which is created as a TEMP view — presumably it cannot be resolved from a
# fresh connection; confirm before other notebooks rely on it.
con.execute(f"""
CREATE VIEW team_week_panel_nextweek_model AS
SELECT
  p.*,
  f.has_next_week
FROM {PANEL_TABLE} p
JOIN panel_next_week_flags f
  ON f.season = p.{SEASON_COL}
 AND f.week = p.{WEEK_COL}
 AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
WHERE f.has_next_week = 1
""")

# Rows versus distinct keys in the modeling view. Printed explicitly because a
# bare expression in the middle of a cell is never rendered by the notebook.
model_key_counts = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  COUNT(DISTINCT {SEASON_COL} || '-' || {WEEK_COL} || '-' || {TEAM_COL}) AS distinct_keys_model
FROM team_week_panel_nextweek_model
""").df()
print(model_key_counts)

# Per-season totals of kept and dropped rows; kept as the last expression so it is rendered.
con.execute(f"""
SELECT
  p.{SEASON_COL} AS season,
  COUNT(*) AS rows_total,
  SUM(CASE WHEN f.has_next_week = 1 THEN 1 ELSE 0 END) AS rows_kept,
  SUM(CASE WHEN f.has_next_week = 0 THEN 1 ELSE 0 END) AS rows_dropped
FROM {PANEL_TABLE} p
JOIN panel_next_week_flags f
  ON f.season = p.{SEASON_COL}
 AND f.week = p.{WEEK_COL}
 AND f.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
GROUP BY 1
ORDER BY 1
""").df()

Quick sanity check to confirm that the special teams totals exactly match the intended bucket identities

In [None]:
# Result columns, one per identity, in the order they are unpacked below.
identity_count_cols = (
    "bad_all_identity",
    "bad_scorelinked_identity",
    "bad_nonscore_identity",
    "bad_partition_identity",
)

# Count rows where a total differs from the sum of its component buckets.
# A comparison involving NULL does not satisfy the WHEN, so such rows count as 0 here.
st_identity = con.execute(f"""
SELECT
  SUM(CASE WHEN ST_Load_All_w != (ST_Punt_w + ST_PuntReturn_w + ST_Kickoff_w + ST_KickReturn_w + ST_FG_w + ST_XP_w + ST_Rare_w) THEN 1 ELSE 0 END) AS bad_all_identity,
  SUM(CASE WHEN ST_Load_ScoreLinked_w != (ST_Kickoff_w + ST_XP_w + ST_FG_w) THEN 1 ELSE 0 END) AS bad_scorelinked_identity,
  SUM(CASE WHEN ST_Load_NonScore_w != (ST_Punt_w + ST_PuntReturn_w + ST_KickReturn_w + ST_Rare_w) THEN 1 ELSE 0 END) AS bad_nonscore_identity,
  SUM(CASE WHEN ST_Load_All_w != (ST_Load_ScoreLinked_w + ST_Load_NonScore_w) THEN 1 ELSE 0 END) AS bad_partition_identity
FROM {PANEL_TABLE}
""").df()

print(st_identity)

bad_all, bad_sl, bad_ns, bad_part = (
    int(st_identity[count_col].iloc[0]) for count_col in identity_count_cols
)

if any((bad_all, bad_sl, bad_ns, bad_part)):
    # Show up to 50 rows that break at least one identity before failing.
    sample = con.execute(f"""
    SELECT
      {SEASON_COL} AS season,
      {WEEK_COL} AS week,
      {TEAM_COL} AS team,
      ST_Punt_w,
      ST_PuntReturn_w,
      ST_Kickoff_w,
      ST_KickReturn_w,
      ST_FG_w,
      ST_XP_w,
      ST_Rare_w,
      ST_Load_All_w,
      ST_Load_ScoreLinked_w,
      ST_Load_NonScore_w
    FROM {PANEL_TABLE}
    WHERE
      ST_Load_All_w != (ST_Punt_w + ST_PuntReturn_w + ST_Kickoff_w + ST_KickReturn_w + ST_FG_w + ST_XP_w + ST_Rare_w)
      OR ST_Load_ScoreLinked_w != (ST_Kickoff_w + ST_XP_w + ST_FG_w)
      OR ST_Load_NonScore_w != (ST_Punt_w + ST_PuntReturn_w + ST_KickReturn_w + ST_Rare_w)
      OR ST_Load_All_w != (ST_Load_ScoreLinked_w + ST_Load_NonScore_w)
    ORDER BY season DESC, week DESC, team
    LIMIT 50
    """).df()
    print(sample)
    raise RuntimeError("Special teams buckets do not satisfy the intended identities, investigate notebook 04 bucketing logic")

Quick sanity check to confirm that 'ScoreLinked' moves with the points environment more than 'NonScore' by comparing correlations with total game points

In [None]:
# Pull the two load measures together with the points columns for every panel row.
df_points = con.execute(f"""
SELECT
  {SEASON_COL} AS season,
  {WEEK_COL} AS week,
  {TEAM_COL} AS team,
  points_for,
  points_against,
  ST_Load_ScoreLinked_w,
  ST_Load_NonScore_w
FROM {PANEL_TABLE}
""").df()

# Total points in the game from this team's row: own points plus opponent points.
df_points["total_points_game"] = df_points["points_for"] + df_points["points_against"]

# Use only rows where all three inputs are present so both correlations share a sample.
df_corr = df_points.dropna(subset=["total_points_game", "ST_Load_ScoreLinked_w", "ST_Load_NonScore_w"]).copy()

corr_scorelinked = df_corr["ST_Load_ScoreLinked_w"].corr(df_corr["total_points_game"])
corr_nonscore = df_corr["ST_Load_NonScore_w"].corr(df_corr["total_points_game"])

print("corr ScoreLinked with total points game", corr_scorelinked)
print("corr NonScore with total points game", corr_nonscore)

# Any comparison with NaN is False, so an undefined correlation would otherwise
# slip through the check below as if it had passed.
if pd.isna(corr_scorelinked) or pd.isna(corr_nonscore):
    raise RuntimeError("Correlation with total points is undefined, check for empty or constant ScoreLinked, NonScore, or points columns")

if abs(corr_scorelinked) < abs(corr_nonscore):
    raise RuntimeError("ScoreLinked is not more correlated with points environment than NonScore, investigate ScoreLinked definition and joins")

# The same two correlations computed within each season, with the group size.
by_season = (
    df_corr.groupby("season", as_index=False)
    .apply(
        lambda g: pd.Series({
            "corr_scorelinked_total_points": g["ST_Load_ScoreLinked_w"].corr(g["total_points_game"]),
            "corr_nonscore_total_points": g["ST_Load_NonScore_w"].corr(g["total_points_game"]),
            "n_rows": len(g),
        }),
        include_groups=False,
    )
    .reset_index(drop=True)
)

# Last 15 seasons by sort order; kept as the last expression so the notebook renders it.
by_season.sort_values("season").tail(15)

Quick sanity check to confirm that 'NonScore' correlates more with punts and returns than with touchdowns by constructing a touchdowns per team week table from play by play and comparing correlations

In [None]:
# Rebuild the touchdowns-per-team-week table from play by play on every run.
con.execute("DROP TABLE IF EXISTS team_week_tds_off")

# One row per season, week and possession team with the count of plays flagged touchdown = 1.
con.execute("""
CREATE TABLE team_week_tds_off AS
SELECT
  season,
  week,
  CAST(posteam AS VARCHAR) AS team,
  SUM(CASE WHEN touchdown = 1 THEN 1 ELSE 0 END) AS tds_off_w
FROM pbp
WHERE posteam IS NOT NULL
GROUP BY 1,2,3
""")

# Shape and range summary of the new table. Printed explicitly because a bare
# expression in the middle of a cell is never rendered by the notebook.
tds_summary = con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys,
  MIN(tds_off_w) AS min_tds,
  MAX(tds_off_w) AS max_tds,
  AVG(tds_off_w) AS avg_tds
FROM team_week_tds_off
""").df()
print(tds_summary)

# Panel rows with their touchdown count; rows without a match get 0 through COALESCE.
df_ns = con.execute(f"""
SELECT
  p.{SEASON_COL} AS season,
  p.{WEEK_COL} AS week,
  p.{TEAM_COL} AS team,
  p.ST_Load_NonScore_w,
  p.ST_Punt_w,
  p.ST_PuntReturn_w,
  p.ST_KickReturn_w,
  COALESCE(t.tds_off_w, 0) AS tds_off_w
FROM {PANEL_TABLE} p
LEFT JOIN team_week_tds_off t
  ON t.season = p.{SEASON_COL}
 AND t.week = p.{WEEK_COL}
 AND t.team = CAST(p.{TEAM_COL} AS VARCHAR)
""").df()

# Proxy for punt and return volume: punts plus punt returns plus kick returns.
df_ns["punt_return_proxy"] = df_ns["ST_Punt_w"] + df_ns["ST_PuntReturn_w"] + df_ns["ST_KickReturn_w"]

# Use only rows where all three inputs are present so both correlations share a sample.
df_ns_corr = df_ns.dropna(subset=["ST_Load_NonScore_w", "punt_return_proxy", "tds_off_w"]).copy()

corr_ns_punts = df_ns_corr["ST_Load_NonScore_w"].corr(df_ns_corr["punt_return_proxy"])
corr_ns_tds = df_ns_corr["ST_Load_NonScore_w"].corr(df_ns_corr["tds_off_w"])

print("corr NonScore with punts and returns proxy", corr_ns_punts)
print("corr NonScore with offensive touchdowns", corr_ns_tds)

# Any comparison with NaN is False, so an undefined correlation would otherwise
# slip through the check below as if it had passed.
if pd.isna(corr_ns_punts) or pd.isna(corr_ns_tds):
    raise RuntimeError("Correlation for NonScore is undefined, check for empty or constant NonScore, punt and return, or touchdown columns")

if abs(corr_ns_punts) < abs(corr_ns_tds):
    raise RuntimeError("NonScore is not more correlated with punts and returns than touchdowns, investigate NonScore definition and component mapping")

# The same two correlations computed within each season, with the group size.
by_season_ns = (
    df_ns_corr.groupby("season", as_index=False)
    .apply(
        lambda g: pd.Series({
            "corr_nonscore_punts_returns": g["ST_Load_NonScore_w"].corr(g["punt_return_proxy"]),
            "corr_nonscore_tds_off": g["ST_Load_NonScore_w"].corr(g["tds_off_w"]),
            "n_rows": len(g),
        }),
        include_groups=False,
    )
    .reset_index(drop=True)
)

# Last 15 seasons by sort order; kept as the last expression so the notebook renders it.
by_season_ns.sort_values("season").tail(15)

Quick sanity check to confirm that the next week modeling view has no unexpected nulls in core exposures and controls, which protects against join failures that can potentially collapse sample size later

In [None]:
# Exposure, control and outcome columns that should be fully populated in the model view.
core_cols = [
    "ST_Load_NonScore_w",
    "ST_Load_ScoreLinked_w",
    "ST_Load_All_w",
    "Z_ST_NonScore_w",
    "ST_Shock_NonScore_w",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "off_yards_per_play_w",
    "points_for",
    "points_against",
    "score_diff_w",
    "blowout_flag_w",
    "days_rest_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "Inj_Off_Next_w",
    "Inj_Def_Next_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
]

cols_now = set(_existing_cols("team_week_panel_nextweek_model"))

# Columns absent from the view are excluded from the null check; report them so
# the exclusion is visible instead of silent.
skipped_cols = [c for c in core_cols if c not in cols_now]
print("Core columns absent from model view and not checked", skipped_cols)

core_cols = [c for c in core_cols if c in cols_now]

# One null-count expression per checked column.
null_expr = ",\n  ".join([f"SUM(CASE WHEN {c} IS NULL THEN 1 ELSE 0 END) AS null_{c}" for c in core_cols])

nulls = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  {null_expr}
FROM team_week_panel_nextweek_model
""").df()

# Printed explicitly because a bare expression in the middle of a cell is never
# rendered by the notebook.
print(nulls)

# Collect (column, null count) pairs for every column that has at least one null.
bad = []
for c in core_cols:
    v = int(nulls[f"null_{c}"].iloc[0])
    if v != 0:
        bad.append((c, v))

print("Nonzero null counts in model view", bad)

if bad:
    raise RuntimeError("Model view contains nulls in required exposures or controls, rerun upstream joins and inspect the listed columns")

We aggregate the results of our structural and integrity tests into a single, high-level summary table, providing a centralized "health report" for the dataset before the final regression begins

In [None]:
# Rebuild the one-row diagnostics table on every run.
con.execute("DROP TABLE IF EXISTS panel_step11_diagnostics")

# Row counts come from SQL subqueries; the four correlations are Python variables
# that this cell does not define, formatted into the SQL text as float literals.
# NOTE(review): a NaN correlation would be interpolated as the bare token nan —
# presumably DuckDB would not read that as a numeric literal; confirm.
con.execute(f"""
CREATE TABLE panel_step11_diagnostics AS
SELECT
  CURRENT_TIMESTAMP AS created_at,
  (SELECT COUNT(*) FROM {PANEL_TABLE}) AS rows_panel,
  (SELECT COUNT(*) FROM team_week_panel_nextweek_model) AS rows_nextweek_model,
  (SELECT COUNT(*) FROM {PANEL_TABLE}) - (SELECT COUNT(*) FROM team_week_panel_nextweek_model) AS rows_dropped_no_next_week,
  {float(corr_scorelinked)} AS corr_scorelinked_total_points,
  {float(corr_nonscore)} AS corr_nonscore_total_points,
  {float(corr_ns_punts)} AS corr_nonscore_punts_returns,
  {float(corr_ns_tds)} AS corr_nonscore_tds_off
""")

# Show the stored diagnostics row; the last expression of the cell, so it is rendered.
con.execute("SELECT * FROM panel_step11_diagnostics").df()

Quick sanity check to confirm that the rows missing 'w+1' behave as expected at the team season level, and that the 2017 and 2022 deviations are not a broken panel join

In [None]:
# Shared CTE: per season and team, the number of rows without a following week
# and the total number of rows. Both queries below are prefixed with this text.
per_team_cte = """
WITH per_team AS (
  SELECT
    season,
    team_key,
    SUM(CASE WHEN has_next_week = 0 THEN 1 ELSE 0 END) AS n_no_next_week,
    COUNT(*) AS n_rows
  FROM panel_next_week_flags
  GROUP BY 1,2
)
"""

# How many teams fall into each n_no_next_week value, per season.
missing_dist = con.execute(per_team_cte + """SELECT
  season,
  n_no_next_week,
  COUNT(*) AS n_teams
FROM per_team
GROUP BY 1,2
ORDER BY season, n_no_next_week
""").df()

print(missing_dist)

# Team seasons whose count of rows without a following week is not 1, 2 or 3.
odd_teams = con.execute(per_team_cte + """SELECT *
FROM per_team
WHERE n_no_next_week NOT IN (1, 2, 3)
ORDER BY season, team_key
""").df()

n_odd = len(odd_teams)
print("team seasons with unexpected n_no_next_week", n_odd)
if n_odd > 0:
    print(odd_teams.head(50))
    raise RuntimeError("Unexpected missing next week pattern, investigate schedule ingestion, canceled games, or missing team weeks")

Quick sanity check to confirm that the special teams bucket identities cannot pass if any contributing columns are null

In [None]:
# For each of the four bucket identities, count rows that either have a NULL in
# any participating column or whose total differs from the sum of its parts.
# The NULL branch comes first in every CASE, so a NULL input is always counted as bad.
st_identity_hardened = con.execute(f"""
SELECT
  SUM(
    CASE
      WHEN ST_Load_All_w IS NULL
        OR ST_Punt_w IS NULL
        OR ST_PuntReturn_w IS NULL
        OR ST_Kickoff_w IS NULL
        OR ST_KickReturn_w IS NULL
        OR ST_FG_w IS NULL
        OR ST_XP_w IS NULL
        OR ST_Rare_w IS NULL
      THEN 1
      WHEN ST_Load_All_w != (ST_Punt_w + ST_PuntReturn_w + ST_Kickoff_w + ST_KickReturn_w + ST_FG_w + ST_XP_w + ST_Rare_w)
      THEN 1
      ELSE 0
    END
  ) AS bad_all_identity,
  SUM(
    CASE
      WHEN ST_Load_ScoreLinked_w IS NULL
        OR ST_Kickoff_w IS NULL
        OR ST_XP_w IS NULL
        OR ST_FG_w IS NULL
      THEN 1
      WHEN ST_Load_ScoreLinked_w != (ST_Kickoff_w + ST_XP_w + ST_FG_w)
      THEN 1
      ELSE 0
    END
  ) AS bad_scorelinked_identity,
  SUM(
    CASE
      WHEN ST_Load_NonScore_w IS NULL
        OR ST_Punt_w IS NULL
        OR ST_PuntReturn_w IS NULL
        OR ST_KickReturn_w IS NULL
        OR ST_Rare_w IS NULL
      THEN 1
      WHEN ST_Load_NonScore_w != (ST_Punt_w + ST_PuntReturn_w + ST_KickReturn_w + ST_Rare_w)
      THEN 1
      ELSE 0
    END
  ) AS bad_nonscore_identity,
  SUM(
    CASE
      WHEN ST_Load_All_w IS NULL
        OR ST_Load_ScoreLinked_w IS NULL
        OR ST_Load_NonScore_w IS NULL
      THEN 1
      WHEN ST_Load_All_w != (ST_Load_ScoreLinked_w + ST_Load_NonScore_w)
      THEN 1
      ELSE 0
    END
  ) AS bad_partition_identity
FROM {PANEL_TABLE}
""").df()

print(st_identity_hardened)

# Add the four counts of the single result row; any nonzero total fails the check.
bad_any = int(st_identity_hardened[["bad_all_identity","bad_scorelinked_identity","bad_nonscore_identity","bad_partition_identity"]].sum(axis=1).iloc[0])
if bad_any != 0:
    raise RuntimeError("Special teams bucket identities failed or encountered null inputs, investigate notebook 04 bucketing and joins")

Quick sanity check to confirm that the touchdown proxy join is actually matching the panel rows, and that 'NonScore' still relates more to punts and returns than to touchdowns using a 'game_id' based join

In [None]:
# Step 1: touchdowns per game and possession team, keyed by game_id with the raw
# pbp team code.
con.execute("DROP VIEW IF EXISTS game_team_tds_off")

con.execute("""
CREATE TEMP VIEW game_team_tds_off AS
SELECT
  game_id,
  CAST(posteam AS VARCHAR) AS team_key,
  SUM(CASE WHEN touchdown = 1 THEN 1 ELSE 0 END) AS tds_off_w
FROM pbp
WHERE posteam IS NOT NULL
GROUP BY 1,2
""")

# Share of panel rows that find no touchdown row when joined on game_id and team code.
df_join_check = con.execute(f"""
SELECT
  SUM(CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END) AS missing_rows,
  COUNT(*) AS total_rows,
  CAST(SUM(CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) AS missing_rate
FROM {PANEL_TABLE} p
LEFT JOIN game_team_tds_off t
  ON t.game_id = p.game_id
 AND t.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
""").df()

# Printed explicitly: a bare expression in the middle of a cell is never rendered
# by the notebook, so this diagnostic would otherwise be invisible.
print(df_join_check)

# Step 2: compare the sets of team codes used by the panel and by pbp.
panel_codes = con.execute(f"""
SELECT DISTINCT CAST({TEAM_COL} AS VARCHAR) AS team_code
FROM {PANEL_TABLE}
ORDER BY 1
""").df()["team_code"].tolist()

pbp_codes = con.execute("""
SELECT DISTINCT CAST(posteam AS VARCHAR) AS team_code
FROM pbp
WHERE posteam IS NOT NULL
ORDER BY 1
""").df()["team_code"].tolist()

panel_set = set(panel_codes)
pbp_set = set(pbp_codes)

print("panel codes not in pbp posteam codes", sorted(panel_set - pbp_set))
print("pbp posteam codes not in panel codes", sorted(pbp_set - panel_set))

# Step 3: which season and team combinations account for the unmatched panel rows.
missing_breakdown = con.execute(f"""
WITH tds_raw AS (
  SELECT
    game_id,
    CAST(posteam AS VARCHAR) AS posteam_raw,
    SUM(CASE WHEN touchdown = 1 THEN 1 ELSE 0 END) AS tds_off_w
  FROM pbp
  WHERE posteam IS NOT NULL
  GROUP BY 1,2
),
joined AS (
  SELECT
    p.{SEASON_COL} AS season,
    CAST(p.{TEAM_COL} AS VARCHAR) AS team_code,
    CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END AS missing_join
  FROM {PANEL_TABLE} p
  LEFT JOIN tds_raw t
    ON t.game_id = p.game_id
   AND t.posteam_raw = CAST(p.{TEAM_COL} AS VARCHAR)
)
SELECT
  season,
  team_code,
  SUM(missing_join) AS n_missing,
  COUNT(*) AS n_rows
FROM joined
GROUP BY 1,2
HAVING SUM(missing_join) > 0
ORDER BY n_missing DESC, season, team_code
LIMIT 50
""").df()

# Printed for the same reason as df_join_check above.
print(missing_breakdown)

# Step 4: map pbp team codes onto panel team codes. A candidate pair is used only
# when the source code occurs in pbp, the target occurs in the panel, and the
# source itself does not occur in the panel.
candidate_aliases = {
    "STL": "LA",
    "LAR": "LA",
    "SD": "LAC",
    "OAK": "LV",
    "JAC": "JAX",
    "WSH": "WAS",
}

pbp_set = set(con.execute("""
SELECT DISTINCT CAST(posteam AS VARCHAR) AS team_code
FROM pbp
WHERE posteam IS NOT NULL
""").df()["team_code"].tolist())

panel_set = set(con.execute(f"""
SELECT DISTINCT CAST({TEAM_COL} AS VARCHAR) AS team_code
FROM {PANEL_TABLE}
""").df()["team_code"].tolist())

final_pairs = []
for src, dst in candidate_aliases.items():
    if src in pbp_set and dst in panel_set and src not in panel_set:
        final_pairs.append((src, dst))

alias_df = pd.DataFrame(final_pairs, columns=["pbp_team", "panel_team"])
print("alias pairs used", alias_df.to_dict(orient="records"))

# Expose the alias DataFrame to SQL under the name team_key_alias_map.
con.register("team_key_alias_map", alias_df)

# Rebuild the touchdown view with pbp codes translated through the alias map;
# codes without an alias keep their raw value.
con.execute("DROP VIEW IF EXISTS game_team_tds_off")

con.execute("""
CREATE TEMP VIEW game_team_tds_off AS
SELECT
  p.game_id,
  COALESCE(m.panel_team, CAST(p.posteam AS VARCHAR)) AS team_key,
  SUM(CASE WHEN p.touchdown = 1 THEN 1 ELSE 0 END) AS tds_off_w
FROM pbp p
LEFT JOIN team_key_alias_map m
  ON m.pbp_team = CAST(p.posteam AS VARCHAR)
WHERE p.posteam IS NOT NULL
GROUP BY 1,2
""")

# Missing rate again, now against the alias-translated view.
df_join_check2 = con.execute(f"""
SELECT
  SUM(CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END) AS missing_rows,
  COUNT(*) AS total_rows,
  CAST(SUM(CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) AS missing_rate
FROM {PANEL_TABLE} p
LEFT JOIN game_team_tds_off t
  ON t.game_id = p.game_id
 AND t.team_key = CAST(p.{TEAM_COL} AS VARCHAR)
""").df()

# Printed for the same reason as df_join_check above.
print(df_join_check2)

# Step 5: the opposite direction, mapping panel team codes onto pbp team codes.
# A pair is used only when the panel code occurs in the panel and the pbp code
# occurs in pbp.
panel_to_pbp_aliases = {
    "STL": "LA",
    "SD": "LAC",
    "OAK": "LV",
}

panel_set = set(con.execute(f"""
SELECT DISTINCT CAST({TEAM_COL} AS VARCHAR) AS team_code
FROM {PANEL_TABLE}
""").df()["team_code"].tolist())

pbp_set = set(con.execute("""
SELECT DISTINCT CAST(posteam AS VARCHAR) AS team_code
FROM pbp
WHERE posteam IS NOT NULL
""").df()["team_code"].tolist())

final_pairs = []
for src_panel, dst_pbp in panel_to_pbp_aliases.items():
    if src_panel in panel_set and dst_pbp in pbp_set:
        final_pairs.append((src_panel, dst_pbp))

alias_df = pd.DataFrame(final_pairs, columns=["panel_team", "pbp_team"])
print("panel to pbp alias pairs used", alias_df.to_dict(orient="records"))

# Expose the second alias DataFrame to SQL under the name panel_to_pbp_team_map.
con.register("panel_to_pbp_team_map", alias_df)

# Missing rate with the panel code translated to its pbp code where a mapping exists.
df_join_check3 = con.execute(f"""
SELECT
  SUM(CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END) AS missing_rows,
  COUNT(*) AS total_rows,
  CAST(SUM(CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) AS missing_rate
FROM {PANEL_TABLE} p
LEFT JOIN game_team_tds_off t
  ON t.game_id = p.game_id
 AND t.team_key = COALESCE(
   (SELECT pbp_team FROM panel_to_pbp_team_map m WHERE m.panel_team = CAST(p.{TEAM_COL} AS VARCHAR)),
   CAST(p.{TEAM_COL} AS VARCHAR)
 )
""").df()

# Kept as the last expression of the cell so the notebook renders it.
df_join_check3


We recompute the diagnostics inside SQL so that in the future, we don't need to depend on Python variables from earlier cells

In [None]:
# Rebuild the one-row diagnostics table on every run.
con.execute("DROP TABLE IF EXISTS panel_step11_diagnostics")

# All figures are computed in SQL. The statement reads panel_to_pbp_team_map and
# game_team_tds_off, neither of which is created in this cell, so both must
# already exist on the connection.
# joined: every panel row with its touchdown count (NULL when unmatched) and a
#         0/1 flag for a missing match.
# agg:    row counts, four correlations via corr(), and the mean of the missing
#         flag as the join's missing rate.
con.execute(f"""
CREATE TABLE panel_step11_diagnostics AS
WITH joined AS (
  SELECT
    p.*,
    t.tds_off_w,
    CASE WHEN t.tds_off_w IS NULL THEN 1 ELSE 0 END AS missing_tds_join
  FROM {PANEL_TABLE} p
  LEFT JOIN panel_to_pbp_team_map m
    ON m.panel_team = CAST(p.{TEAM_COL} AS VARCHAR)
  LEFT JOIN game_team_tds_off t
    ON t.game_id = p.game_id
   AND t.team_key = COALESCE(m.pbp_team, CAST(p.{TEAM_COL} AS VARCHAR))
),
agg AS (
  SELECT
    CURRENT_TIMESTAMP AS created_at,
    COUNT(*) AS rows_panel,
    (SELECT COUNT(*) FROM team_week_panel_nextweek_model) AS rows_nextweek_model,
    COUNT(*) - (SELECT COUNT(*) FROM team_week_panel_nextweek_model) AS rows_dropped_no_next_week,
    corr(CAST(ST_Load_ScoreLinked_w AS DOUBLE), CAST(points_for + points_against AS DOUBLE)) AS corr_scorelinked_total_points,
    corr(CAST(ST_Load_NonScore_w AS DOUBLE), CAST(points_for + points_against AS DOUBLE)) AS corr_nonscore_total_points,
    corr(CAST(ST_Load_NonScore_w AS DOUBLE), CAST(ST_Punt_w + ST_PuntReturn_w + ST_KickReturn_w AS DOUBLE)) AS corr_nonscore_punts_returns,
    corr(CAST(ST_Load_NonScore_w AS DOUBLE), CAST(COALESCE(tds_off_w, 0) AS DOUBLE)) AS corr_nonscore_tds_off,
    AVG(CAST(missing_tds_join AS DOUBLE)) AS tds_join_missing_rate
  FROM joined
)
SELECT * FROM agg
""")

# Show the stored diagnostics row; the last expression of the cell, so it is rendered.
con.execute("SELECT * FROM panel_step11_diagnostics").df()