We initialize the Python imports and open a DuckDB connection that every later cell reuses. We also define small helper functions for schema checks and safe table overwrites, along with the key column names used throughout the notebook.

In [1]:
from pathlib import Path
import duckdb

# Key column names referenced by the cells below.
TEAM_ABBR_COL = "team"
SEASON_COL = "season"
WEEK_COL = "week"

# Directory that holds the DuckDB database file; created if it does not exist.
DB_DIR = Path("../db")
DB_DIR.mkdir(parents=True, exist_ok=True)

# Single module-level connection used by the helper functions below.
con = duckdb.connect(str(DB_DIR.joinpath("nflpa.duckdb")))

def _existing_cols(table_name: str) -> list[str]:
    """Return the column names of ``table_name`` as reported by ``DESCRIBE``.

    The table name is interpolated into the SQL text as-is, so callers must
    pass a trusted identifier.
    """
    schema = con.execute(f"DESCRIBE {table_name}").df()
    names = schema["column_name"]
    return names.tolist()

def _star_excluding(table_name: str, alias: str, exclude_cols: list[str]) -> str:
    """Build a ``SELECT`` column list for ``table_name`` minus some columns.

    Args:
        table_name: Table whose columns are listed via ``_existing_cols``.
        alias: Table alias prefixed to every emitted column reference.
        exclude_cols: Column names to leave out (exact, case-sensitive match).

    Returns:
        The kept columns rendered as ``alias."column"`` and joined with
        ``",\\n  "``, ready to be placed in a SELECT list. The string is
        empty when every column is excluded.
    """
    # Build the exclusion set once instead of once per column.
    excluded = set(exclude_cols)
    # Double any embedded double quote so the quoted identifier stays valid SQL.
    kept = [
        col.replace('"', '""')
        for col in _existing_cols(table_name)
        if col not in excluded
    ]
    return ",\n  ".join(f'{alias}."{col}"' for col in kept)

def _first_present(cols: list[str], options: list[str]) -> str | None:
    s = set([c.lower() for c in cols])
    for opt in options:
        if opt.lower() in s:
            for c in cols:
                if c.lower() == opt.lower():
                    return c
    return None

Quick sanity check to confirm that the final dataframe is loaded correctly and contains the primary dependent variables needed for regression. This prevents the model from running on an incomplete or outdated dataset.

In [2]:
# Columns that must be present in team_week_panel before continuing.
required = [
    SEASON_COL,
    WEEK_COL,
    TEAM_ABBR_COL,
    "game_id",
    "points_for",
    "points_against",
    "Inj_Off_NextGame_w",
    "Inj_Def_NextGame_w",
]

# Compare the required names against the table's current schema
# (exact, case-sensitive match), keeping the order of `required`.
cols_now = _existing_cols("team_week_panel")
missing = [name for name in required if name not in cols_now]

print("Missing required columns", missing)
if missing:
    print("STOP, rerun earlier notebooks before step 10")
else:
    print("OK")

# Total row count next to the count of distinct season-week-team keys;
# left as the final expression so the notebook displays the result.
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()

Missing required columns []
OK


Unnamed: 0,rows,distinct_keys
0,6782,6782
