We import the Python libraries and open a DuckDB connection that every later cell reuses

In [1]:
import duckdb
import pandas as pd
from pathlib import Path

# Locate the project root: the first directory, starting at the working
# directory and moving up through its parents, that contains a "db" entry.
cwd = Path.cwd()
root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "db").exists()),
    None,
)

if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"

print("Using DB_PATH", DB_PATH)

# Single shared connection object; the rest of the notebook refers to `con`.
con = duckdb.connect(str(DB_PATH))

# Session settings: thread count and memory ceiling for DuckDB.
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='4GB'")

Using DB_PATH /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


<_duckdb.DuckDBPyConnection at 0x10f408970>

We validate the prerequisites and decide which team key column to use, so that the joins in the upcoming modeling phase are aligned across the different data sources

In [2]:
# Fail fast when the panel table has not been built yet.
tables = set(con.execute("SHOW TABLES").df()["name"].tolist())

if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 05 first")

# Column inventory of the panel, kept both as an ordered list and as a set
# for fast membership checks.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# Columns this notebook reads from the panel; all of them must be present.
required_cols = [
    "season",
    "week",
    "ST_Load_All_w",
    "ST_Load_ScoreLinked_w",
    "ST_Load_NonScore_w",
    "ST_Shock_All_w",
    "ST_Shock_ScoreLinked_w",
    "ST_Shock_NonScore_w",
]
missing = [c for c in required_cols if c not in panel_cols_set]
if missing:
    raise RuntimeError(f"Missing required columns in team_week_panel, {missing}")

# Pick the team key column: prefer "team_id", otherwise "team". Verify that the
# chosen column really exists, so a panel with neither fails here with a clear
# message instead of later inside an unrelated SQL query.
team_col_candidates = ("team_id", "team")
TEAM_COL = next((c for c in team_col_candidates if c in panel_cols_set), None)
if TEAM_COL is None:
    raise RuntimeError(
        "team_week_panel has no team key column, "
        f"expected one of {list(team_col_candidates)}"
    )
print("Using TEAM_COL", TEAM_COL)

def _existing_cols(table_name):
    """Return the set of column names DuckDB reports for ``table_name``.

    The names are read from ``PRAGMA table_info`` through the shared ``con``
    connection. ``table_name`` is interpolated into the statement as-is.
    """
    table_info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    column_names = table_info["name"].tolist()
    return set(column_names)

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build an ``alias.*`` select item that omits the given columns if present.

    Only the entries of ``cols_to_maybe_exclude`` that actually exist in
    ``table_name`` are placed in the ``EXCLUDE (...)`` list, in the order they
    were given. When none of them exist, a plain ``alias.*`` is returned.
    """
    present = _existing_cols(table_name)
    to_drop = [col for col in cols_to_maybe_exclude if col in present]
    if not to_drop:
        return f"{alias}.*"
    drop_list = ", ".join(to_drop)
    return f"{alias}.* EXCLUDE ({drop_list})"

Using TEAM_COL team


Quick sanity check to confirm that we still have all the seasons and weeks we started with and that the table hasn't been accidentally filtered during the recent processing steps

In [3]:
# Per season: number of panel rows, number of distinct teams, and the first
# and last week present.
season_coverage_sql = f"""
SELECT
  season,
  COUNT(*) AS rows,
  COUNT(DISTINCT {TEAM_COL}) AS teams,
  MIN(week) AS min_week,
  MAX(week) AS max_week
FROM team_week_panel
GROUP BY season
ORDER BY season
"""
con.execute(season_coverage_sql).df()

Unnamed: 0,season,rows,teams,min_week,max_week
0,2012,512,32,1,17
1,2013,512,32,1,17
2,2014,512,32,1,17
3,2015,512,32,1,17
4,2016,512,32,1,17
5,2017,512,32,1,17
6,2018,512,32,1,17
7,2019,512,32,1,17
8,2020,512,32,1,17
9,2021,544,32,1,18


Quick sanity check to find team seasons from 2021 onward that do not have 17 games recorded

In [4]:
# List every (season, team) from 2021 onward whose row count in the panel is
# not 17. The season restriction is a plain row filter, so it belongs in WHERE
# (applied before grouping); HAVING is kept for the aggregate condition only.
con.execute(f"""
SELECT
  season,
  {TEAM_COL} AS team,
  COUNT(*) AS n_games
FROM team_week_panel
WHERE season >= 2021
GROUP BY season, {TEAM_COL}
HAVING COUNT(*) <> 17
ORDER BY season, team
""").df()

Unnamed: 0,season,team,n_games
0,2022,BUF,16
1,2022,CIN,16


Quick sanity check to find which weeks are missing for those teams

In [5]:
# For the 2022 teams whose panel row count is not 17, list the weeks in 1-18
# that have no panel row. Every week without a row is reported, so a week in
# which a team simply did not play shows up here as well.
# The season restriction is a row filter and therefore sits in WHERE; HAVING
# only carries the aggregate condition.
con.execute(f"""
WITH team_counts AS (
  SELECT
    season,
    {TEAM_COL} AS team,
    COUNT(*) AS n_games
  FROM team_week_panel
  WHERE season = 2022
  GROUP BY season, {TEAM_COL}
  HAVING COUNT(*) <> 17
),
expected_weeks AS (
  SELECT 2022 AS season, w AS week
  FROM range(1, 19) t(w)
),
team_expected AS (
  SELECT tc.team, ew.season, ew.week
  FROM team_counts tc
  CROSS JOIN expected_weeks ew
),
team_actual AS (
  SELECT season, week, {TEAM_COL} AS team
  FROM team_week_panel
  WHERE season = 2022
)
SELECT
  te.team,
  te.week AS missing_week
FROM team_expected te
LEFT JOIN team_actual ta
  ON te.season = ta.season
 AND te.week = ta.week
 AND te.team = ta.team
WHERE ta.team IS NULL
ORDER BY te.team, te.week
""").df()

Unnamed: 0,team,missing_week
0,BUF,7
1,BUF,17
2,CIN,10
3,CIN,17


Quick sanity check to list the 2022 schedule rows for BUF and CIN and confirm that they are not restricted to regular season games

In [6]:
# All 2022 schedule rows in which BUF or CIN is the home or the away team.
buf_cin_schedule_sql = """
SELECT
  season,
  week,
  game_id,
  home_team,
  away_team,
  home_score,
  away_score
FROM schedules
WHERE season = 2022
  AND (home_team IN ('BUF','CIN') OR away_team IN ('BUF','CIN'))
ORDER BY week, game_id
"""
con.execute(buf_cin_schedule_sql).df()

Unnamed: 0,season,week,game_id,home_team,away_team,home_score,away_score
0,2022,1,2022_01_BUF_LA,LA,BUF,10,31
1,2022,1,2022_01_PIT_CIN,CIN,PIT,20,23
2,2022,2,2022_02_CIN_DAL,DAL,CIN,20,17
3,2022,2,2022_02_TEN_BUF,BUF,TEN,41,7
4,2022,3,2022_03_BUF_MIA,MIA,BUF,21,19
5,2022,3,2022_03_CIN_NYJ,NYJ,CIN,12,27
6,2022,4,2022_04_BUF_BAL,BAL,BUF,20,23
7,2022,4,2022_04_MIA_CIN,CIN,MIA,27,15
8,2022,5,2022_05_CIN_BAL,BAL,CIN,19,17
9,2022,5,2022_05_PIT_BUF,BUF,PIT,38,3


Quick sanity check to confirm whether a BUF versus CIN matchup row exists in your schedules table for season 2022

In [7]:
# 2022 schedule rows where BUF and CIN face each other, in either
# home/away arrangement.
buf_vs_cin_sql = """
SELECT
  season,
  week,
  game_id,
  home_team,
  away_team,
  home_score,
  away_score
FROM schedules
WHERE season = 2022
  AND (
    (home_team = 'BUF' AND away_team = 'CIN')
    OR
    (home_team = 'CIN' AND away_team = 'BUF')
  )
ORDER BY week, game_id
"""
con.execute(buf_vs_cin_sql).df()

Unnamed: 0,season,week,game_id,home_team,away_team,home_score,away_score
0,2022,20,2022_20_CIN_BUF,BUF,CIN,10,27
