We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Suppress the FutureWarning whose message starts with the BIC/deviance notice so it
# does not clutter cell output. NOTE(review): presumably emitted by statsmodels when
# a fitted model's BIC is read -- confirm against the later modelling cells.
warnings.filterwarnings(
    action="ignore",
    message="The bic value is computed using the deviance formula.*",
    category=FutureWarning,
)

# Absolute path of the directory the notebook is running from.
CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

# Two accepted layouts, in priority order: <dir>/db/nflpa.duckdb, then <dir>/nflpa.duckdb.
# Each layout is tried against the working directory and every ancestor before the
# next layout is considered, so a db/ folder higher up wins over a bare file nearby.
for rel in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    for p in (CWD, *CWD.parents):
        cand = p / rel
        if cand.exists():
            # REPO_ROOT is the directory the layout was resolved against.
            REPO_ROOT, DB_FILE = p, cand
            break
    if DB_FILE is not None:
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

# Open the located database file in read-write mode (read_only=False).
con = duckdb.connect(str(DB_FILE), read_only=False)

# Tables this cell reads; both must already exist in the database.
need_tables = ["step21_frame", "step18_preferred_model_specs"]
existing = {str(row[0]) for row in con.execute("SHOW TABLES").fetchall()}
missing = [tbl for tbl in need_tables if tbl not in existing]

print("connected db", DB_FILE)
print("missing step 22 inputs", missing)
if len(missing) > 0:
    raise RuntimeError("Missing step 22 inputs, rerun step 21 and step 18 first")

# Load both input tables fully into pandas DataFrames.
df = con.execute("SELECT * FROM step21_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

# Name of the team identifier column in step21_frame: "team" if present,
# otherwise "team_key"; neither being present is a hard error.
TEAM_COL = next((c for c in ("team", "team_key") if c in df.columns), None)
if TEAM_COL is None:
    raise RuntimeError("Missing team identifier column in step21_frame")

# Split the preferred-spec table by side. Compare as strings so a non-string
# dtype in the "side" column still matches the "def"/"off" labels.
_side = pref["side"].astype(str)
_def_rows = pref[_side == "def"]
_off_rows = pref[_side == "off"]

# Fail with an actionable message instead of the opaque IndexError that
# .iloc[0] raises on an empty selection.
_absent_sides = [
    label for label, rows in (("def", _def_rows), ("off", _off_rows)) if rows.empty
]
if _absent_sides:
    raise RuntimeError(
        f"step18_preferred_model_specs has no row for side(s) {_absent_sides}, rerun step 18 first"
    )

# When several rows exist for a side, the first one in table order is used.
pref_def = _def_rows.iloc[0]
pref_off = _off_rows.iloc[0]

print("preferred def family", pref_def["family"])
print("preferred off family", pref_off["family"])
print("preferred def formula", pref_def["formula"])
print("preferred off formula", pref_off["formula"])

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
missing step 22 inputs []
preferred def family poisson
preferred off family poisson
preferred def formula Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + vol_nonscore_roll4_prior + cum_shocks_nonscore_prior + short_week_flag_w + bye_last_week_flag_w + home_flag_w + blowout_flag_w + points_for + points_against + offensive_snaps_w + defensive_snaps_w + Inj_Off_Last_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + C(team) + C(season_week)
preferred off formula Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + vol_nonscore_roll4_prior + cum_shocks_nonscore_prior + short_week_flag_w + bye_last_week_flag_w + home_flag_w + blowout_flag_w + points_for + points_against + offensive_snaps_w + defensive_snaps_w + Inj_Off_Last_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + C(team) + C(season_week)
