We set up the Python imports and open a DuckDB connection that every later cell reuses.

In [None]:
import duckdb
import pandas as pd
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "db" folder.
cwd = Path.cwd()

root = None
for p in (cwd, *cwd.parents):
    # is_dir() rather than exists(): a plain file named "db" is not a match.
    if (p / "db").is_dir():
        root = p
        break

if root is None:
    raise FileNotFoundError("Could not find a 'db' folder in or above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# duckdb.connect() creates a new, empty database when the file is missing,
# which would only surface later as missing tables. Fail early instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"DuckDB database file not found: {DB_PATH}")

# Shared connection reused by the cells below.
con = duckdb.connect(str(DB_PATH))

# Session settings: worker thread count and memory ceiling.
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='4GB'")

We verify that the tables the rest of the notebook depends on (pbp, schedules, and team_week_panel) exist in the database before continuing

In [None]:
# Tables this notebook expects to find on the connection.
required = ["pbp", "schedules", "team_week_panel"]

# Names of the tables currently visible on the connection.
tables_df = con.execute("SHOW TABLES").df()
existing = set(tables_df["name"].tolist())

# Preserve the order of `required` so the report is stable.
missing = [name for name in required if name not in existing]

print("Missing", missing)
if missing:
    print("STOP, fix missing tables before continuing")
else:
    print("OK")

We inspect the play-by-play schema to record which optional columns exist, allowing subsequent logic to adapt safely without assuming column presence.

In [None]:
# Read the pbp schema once and keep the column names in a set for lookups.
pbp_cols_df = con.execute("DESCRIBE pbp").df()
pbp_cols = set(pbp_cols_df["column_name"].tolist())

# Map each optional column name to whether pbp actually has it.
optional_flags = {
    col: col in pbp_cols
    for col in (
        "return_team",
        "kickoff_type",
        "onside_kick",
        "safety_kick",
        "free_kick",
        "desc",
    )
}

print("Optional columns present", optional_flags)

# Last expression of the cell: displays the full schema table.
pbp_cols_df

We check that team identifiers in the panel align with those in the play-by-play data, ensuring join keys are consistent across datasets to prevent failed matches

In [None]:
# Column names of the panel table, used to pick the team identifier column.
panel_cols_df = con.execute("DESCRIBE team_week_panel").df()
panel_cols = set(panel_cols_df["column_name"].tolist())

# Prefer "team_id", fall back to "team", and fail loudly if neither exists.
for candidate in ("team_id", "team"):
    if candidate in panel_cols:
        PANEL_TEAM_COL = candidate
        break
else:
    raise ValueError(f"Could not find a team column in team_week_panel, columns are {sorted(panel_cols)[:50]}")

print("Using panel team column", PANEL_TEAM_COL)

# Distinct non-NULL team keys on each side of the comparison.
panel_teams_df = con.execute(
    f"SELECT DISTINCT {PANEL_TEAM_COL} AS team_key FROM team_week_panel WHERE {PANEL_TEAM_COL} IS NOT NULL"
).df()
panel_teams = set(panel_teams_df["team_key"].tolist())

pbp_posteams_df = con.execute("SELECT DISTINCT posteam FROM pbp WHERE posteam IS NOT NULL").df()
pbp_posteams = set(pbp_posteams_df["posteam"].tolist())

intersect = panel_teams & pbp_posteams

print("Distinct panel teams", len(panel_teams))
print("Distinct posteam in pbp", len(pbp_posteams))
print("Intersection size", len(intersect))

# Up to 25 keys from each side that have no counterpart on the other side.
print("Sample posteam not in panel", sorted(pbp_posteams - panel_teams)[:25])
print("Sample panel team not in pbp", sorted(panel_teams - pbp_posteams)[:25])


Quick sanity check to confirm that we are not accidentally pulling older schedule seasons that still use the legacy team codes (OAK, SD, STL)

In [None]:
# Summarize, per team code, the season range (min/max) and row count in
# team_week_panel for the codes 'OAK', 'SD' and 'STL', using the team column
# chosen in PANEL_TEAM_COL. The result is returned as a DataFrame (the cell's
# displayed output); an empty frame means none of these codes appear in the
# panel.
con.execute(f"""
SELECT
  {PANEL_TEAM_COL} AS team,
  MIN(season) AS min_season,
  MAX(season) AS max_season,
  COUNT(*) AS rows
FROM team_week_panel
WHERE {PANEL_TEAM_COL} IN ('OAK','SD','STL')
GROUP BY 1
ORDER BY 1
""").df()


We create a minimal special teams base view using safe NULL placeholders for missing optional columns, ensuring downstream SQL remains stable and preventing schema-related breakages

In [None]:
# For each optional pbp column, pick the SELECT expression used by the view:
# the real column when optional_flags says it exists, otherwise a typed NULL
# exposed under the same output name.
(
    select_return_team,
    select_kickoff_type,
    select_onside_kick,
    select_safety_kick,
    select_free_kick,
    select_desc,
) = (
    present if optional_flags[flag] else placeholder
    for flag, present, placeholder in (
        ("return_team", "return_team", "NULL::VARCHAR AS return_team"),
        ("kickoff_type", "kickoff_type", "NULL::VARCHAR AS kickoff_type"),
        ("onside_kick", "onside_kick", "NULL::INTEGER AS onside_kick"),
        ("safety_kick", "safety_kick", "NULL::INTEGER AS safety_kick"),
        ("free_kick", "free_kick", "NULL::INTEGER AS free_kick"),
        # The desc column is referenced as a quoted identifier and is exposed
        # as play_desc whether or not it exists.
        ("desc", '"desc" AS play_desc', "NULL::VARCHAR AS play_desc"),
    )
)

# Temp view with a fixed set of output columns regardless of which optional
# columns pbp provides.
create_base_view_sql = f"""
CREATE OR REPLACE TEMP VIEW pbp_st_base AS
SELECT
  season,
  week,
  game_id,
  posteam,
  defteam,
  {select_return_team},
  play_type,
  {select_kickoff_type},
  {select_onside_kick},
  {select_safety_kick},
  {select_free_kick},
  {select_desc}
FROM pbp
"""
con.execute(create_base_view_sql)

We filter the play-by-play data to include only regular-season games, ensuring the metric calculations align with the existing panel scope.

In [None]:
# Define temp view pbp_st_reg: the rows of pbp_st_base whose game_id matches a
# schedules row with game_type = 'REG'. Because this is an inner join, plays
# whose game_id has no match in schedules are dropped as well.
con.execute("""
CREATE OR REPLACE TEMP VIEW pbp_st_reg AS
SELECT b.*
FROM pbp_st_base b
JOIN schedules s
  ON b.game_id = s.game_id
WHERE s.game_type = 'REG'
""")

We quantify the frequency of missing "return_team" values for punt and kickoff plays, allowing for the accurate interpretation of return component counts

In [None]:
# For punt and kickoff plays in pbp_st_reg, report per play_type: the number
# of plays, how many have a NULL return_team, and that count as a fraction of
# plays (0.0 to 1.0). The result is returned as a DataFrame (the cell's
# displayed output).
con.execute("""
SELECT
  play_type,
  COUNT(*) AS plays,
  SUM(CASE WHEN return_team IS NULL THEN 1 ELSE 0 END) AS null_return_team,
  AVG(CASE WHEN return_team IS NULL THEN 1.0 ELSE 0.0 END) AS null_return_team_rate
FROM pbp_st_reg
WHERE play_type IN ('punt', 'kickoff')
GROUP BY play_type
ORDER BY play_type
""").df()

We build a rare play condition that utilizes explicit kick flags when available and falls back to description keywords when necessary, ensuring robust identification of special teams events across varying data qualities

In [None]:
# Predicates on the kick-flag columns are always included.
rare_parts = [
    "kickoff_type IN ('onside', 'safety', 'free_kick', 'free kick', 'safety kick')",
    "onside_kick = 1",
    "safety_kick = 1",
    "free_kick = 1",
]

# Description keyword matching is only added when pbp has a desc column.
if optional_flags["desc"]:
    rare_parts.extend(
        [
            "play_desc ILIKE '%onside%'",
            "play_desc ILIKE '%free kick%'",
            "play_desc ILIKE '%safety kick%'",
            "play_desc ILIKE '%fair catch kick%'",
            "play_desc ILIKE '%free-kick%'",
            "play_desc ILIKE '%safety-kick%'",
        ]
    )

# A play is "rare" if any one predicate matches; the parentheses keep the OR
# chain intact when it is combined with other conditions.
rare_condition = f"({' OR '.join(rare_parts)})"

print("Rare condition SQL", rare_condition, sep="\n")

We create a long events table where each row represents a single special teams event attributed to the correct team for each play component, establishing a granular foundation for team-week aggregation

In [None]:
# Define temp view st_events_long with columns (season, week, team_id,
# component), built from pbp_st_reg as a UNION ALL of seven selections:
#   'punt'        -> posteam     on play_type = 'punt'
#   'punt_return' -> return_team on play_type = 'punt'
#   'kickoff'     -> posteam     on play_type = 'kickoff'
#   'kick_return' -> return_team on play_type = 'kickoff'
#   'fg'          -> posteam     on play_type = 'field_goal'
#   'xp'          -> posteam     on play_type = 'extra_point'
#   'rare'        -> posteam     on play_type = 'kickoff' matching rare_condition
# Each selection skips rows where its team column is NULL. UNION ALL keeps
# every row, so one play can contribute several events; for example, a kickoff
# that matches rare_condition yields both a 'kickoff' and a 'rare' row.
# rare_condition is a SQL predicate string interpolated into the query text and
# must be defined before this statement runs.
con.execute(f"""
CREATE OR REPLACE TEMP VIEW st_events_long AS

SELECT
  season,
  week,
  posteam AS team_id,
  'punt' AS component
FROM pbp_st_reg
WHERE play_type = 'punt'
  AND posteam IS NOT NULL

UNION ALL

SELECT
  season,
  week,
  return_team AS team_id,
  'punt_return' AS component
FROM pbp_st_reg
WHERE play_type = 'punt'
  AND return_team IS NOT NULL

UNION ALL

SELECT
  season,
  week,
  posteam AS team_id,
  'kickoff' AS component
FROM pbp_st_reg
WHERE play_type = 'kickoff'
  AND posteam IS NOT NULL

UNION ALL

SELECT
  season,
  week,
  return_team AS team_id,
  'kick_return' AS component
FROM pbp_st_reg
WHERE play_type = 'kickoff'
  AND return_team IS NOT NULL

UNION ALL

SELECT
  season,
  week,
  posteam AS team_id,
  'fg' AS component
FROM pbp_st_reg
WHERE play_type = 'field_goal'
  AND posteam IS NOT NULL

UNION ALL

SELECT
  season,
  week,
  posteam AS team_id,
  'xp' AS component
FROM pbp_st_reg
WHERE play_type = 'extra_point'
  AND posteam IS NOT NULL

UNION ALL

SELECT
  season,
  week,
  posteam AS team_id,
  'rare' AS component
FROM pbp_st_reg
WHERE play_type = 'kickoff'
  AND posteam IS NOT NULL
  AND {rare_condition}
""")