Import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [1]:
import duckdb
from pathlib import Path

# Locate nflpa.duckdb near the notebook and open the connection `con` that later cells reuse.
# Roots are scanned in the order listed here; the first database discovered is the one used.
candidates = []
search_roots = [
    Path("."),
    Path(".."),
    Path("./data"),
    Path("../data"),
    Path("../../data"),
]

for root in search_roots:
    if not root.exists():
        continue
    # The direct match goes first so a database sitting right in a search root is listed
    # ahead of nested copies; the recursive pattern reports it again and the
    # de-duplication below drops the repeat.
    candidates.extend(root.glob("nflpa.duckdb"))
    candidates.extend(root.glob("**/nflpa.duckdb"))

# Overlapping roots (for example "./data" inside ".") yield the same file more than once.
# Keep the first occurrence of each resolved path, preserving discovery order.
seen = set()
duckdb_files = []
for f in candidates:
    resolved = f.resolve()
    fp = str(resolved)
    if fp not in seen:
        seen.add(fp)
        duckdb_files.append(resolved)

print("nflpa.duckdb candidates found", len(duckdb_files))
for i, f in enumerate(duckdb_files[:25]):
    print(i, f)

if not duckdb_files:
    raise RuntimeError("No nflpa.duckdb found near this notebook, run notebook 00 or check where you saved the database")

# Several copies of the database may exist with different contents. The first one is
# still used, but say so loudly instead of picking it silently.
if len(duckdb_files) > 1:
    print("WARNING multiple nflpa.duckdb candidates found, using the first one, check that it is the database the upstream notebooks wrote to")

DB_PATH = duckdb_files[0]
print("using DB_PATH", DB_PATH)

con = duckdb.connect(str(DB_PATH))

# Displayed as the cell result: number of tables visible through the connection.
con.execute("""
SELECT
  COUNT(*) AS n_tables
FROM (SHOW TABLES)
""").df()

nflpa.duckdb candidates found 2
0 /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
1 /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/data/nflpa.duckdb
using DB_PATH /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


Unnamed: 0,n_tables
0,46


Quick sanity check: confirm that the model view `team_week_panel_nextweek_model` can be queried, that it exposes every column required for the analysis, and that none of those columns contains nulls.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Column names used to build the required-column list in this cell.
SEASON_COL, WEEK_COL, TEAM_COL = "season", "week", "team"

# Names of database relations; MODEL_VIEW is the one checked by the
# column and null validation in this cell.
PANEL_TABLE, MODEL_VIEW = "team_week_panel", "team_week_panel_nextweek_model"

def _existing_cols(table_name: str) -> list[str]:
    """Return the column names of ``table_name`` as reported by ``DESCRIBE``.

    The statement runs on the notebook-level connection ``con``. The name is
    interpolated into the SQL text as-is, and any error raised by the query
    propagates to the caller unchanged.
    """
    description = con.execute(f"DESCRIBE {table_name}").df()
    column_names = description["column_name"]
    return column_names.tolist()

def _require_cols(table_name: str, required: list[str]) -> None:
    """Check that ``table_name`` exposes every column named in ``required``.

    The list of absent columns is always printed (empty when nothing is
    missing), in the same order as ``required``.

    Raises:
        RuntimeError: if at least one required column is absent.
    """
    available = set(_existing_cols(table_name))

    absent = []
    for name in required:
        if name not in available:
            absent.append(name)

    print("Missing required columns", absent)
    if not absent:
        return
    raise RuntimeError(f"Missing columns in {table_name}, rerun upstream notebooks, missing, {absent}")

# Columns the model view must expose; each one is also checked for nulls below.
# Order matters only for how the checks are reported.
required_step12 = [
    # columns named by the constants above, plus the game id
    SEASON_COL, WEEK_COL, TEAM_COL, "game_id",
    # points columns
    "points_for", "points_against",
    # ST_Load_* family
    "ST_Load_All_w", "ST_Load_ScoreLinked_w", "ST_Load_NonScore_w",
    # ST_Vol_* family
    "ST_Vol_All_w", "ST_Vol_ScoreLinked_w", "ST_Vol_NonScore_w",
    # ST_Shock_* family
    "ST_Shock_All_w", "ST_Shock_ScoreLinked_w", "ST_Shock_NonScore_w",
    # Inj_*_Next_w columns
    "Inj_Off_Next_w", "Inj_Def_Next_w",
]

# Fail fast if the model view lacks any column listed in required_step12.
_require_cols(MODEL_VIEW, required_step12)

# COUNT(*) - COUNT(col) is the number of NULLs in col. Unlike SUM(CASE ...), it evaluates
# to 0 rather than NULL when the view has no rows, so the int() conversion below
# cannot fail on an empty view.
null_check_expr = ",\n  ".join([f"COUNT(*) - COUNT({c}) AS null_{c}" for c in required_step12])

nulls = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  {null_check_expr}
FROM {MODEL_VIEW}
""").df()

# An empty view would pass the null check trivially, so report it explicitly.
rows_model = int(nulls["rows_model"].iloc[0])
if rows_model == 0:
    raise RuntimeError(f"{MODEL_VIEW} has no rows, rerun upstream notebooks before Step 12")

# Collect (column, null count) pairs for every required column that has nulls.
bad = []
for c in required_step12:
    v = int(nulls[f"null_{c}"].iloc[0])
    if v != 0:
        bad.append((c, v))

print("Nonzero null counts in Step 12 required columns", bad)
if bad:
    raise RuntimeError("Step 12 required columns contain nulls in the model view, rerun upstream joins and inspect the listed columns")

# Displayed as the cell result: one row with the row count and per-column null counts.
nulls

CatalogException: Catalog Error: Table with name panel_next_week_flags does not exist!
Did you mean "injuries_team_week_players"?