We initialize the Python imports and open a DuckDB connection that every later cell reuses. We also load the preferred specs and the modeling frame, then confirm that the season and week columns are present and that there are enough seasons, listed in sorted order, for a chronological split.

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

# Suppress RuntimeWarning messages for everything that runs after this point.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Locate nflpa.duckdb starting from the resolved working directory and walking
# up through every ancestor. A copy under a "db" sub-folder of ANY ancestor
# takes precedence over a copy sitting directly in an ancestor folder, so the
# relative location is the outer loop and the directory is the inner one.
CWD = Path().resolve()
search_roots = [CWD, *CWD.parents]
DB_FILE = next(
    (
        root / relative
        for relative in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb"))
        for root in search_roots
        if (root / relative).exists()
    ),
    None,
)
if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb")

# Open the database file found above. The connection is opened writable
# (read_only=False) -- presumably so later cells can store results; confirm.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Both tables below must already exist in the database; fail early with a
# pointer to the notebook that produces them if either one is absent.
need = ["step18_model_frame", "step18_preferred_model_specs"]
tables_listing = con.execute("SHOW TABLES").df()
existing = set(tables_listing["name"].astype(str))
missing = [table for table in need if table not in existing]
if missing:
    raise RuntimeError(f"Missing tables for step19, {missing}, run notebook 18 first")

# Pull both tables fully into pandas.
df = con.execute("SELECT * FROM step18_model_frame").df()
pref = con.execute("SELECT * FROM step18_preferred_model_specs").df()

# Quick visual sanity check of what was loaded.
print("rows in step18_model_frame", len(df))
print("preferred specs")
print(pref)

# The frame must carry both a season and a week column.
if not {"season", "week"}.issubset(df.columns):
    raise RuntimeError("Missing season or week in step18_model_frame")

# Distinct seasons as plain ints in ascending order; missing values are
# dropped before the integer cast.
season_values = df["season"].dropna().astype(int)
seasons = sorted(season_values.unique().tolist())

# Show the first ten and the last ten seasons (the two slices overlap when
# fewer than twenty seasons are available).
print("seasons", seasons[:10], "to", seasons[-10:])
if len(seasons) < 3:
    raise RuntimeError("Need at least 3 seasons for chronological cross validation")

rows in step18_model_frame 5950
preferred specs
  side         outcome   family                 spec_id  \
0  def  Inj_Def_Next_w  poisson  nonscore_roll4_no_lags   
1  off  Inj_Off_Next_w  poisson  nonscore_roll4_no_lags   

                                             formula           aic  \
0  Inj_Def_Next_w ~ shock_nonscore + shock_x_blow...  20805.188499   
1  Inj_Off_Next_w ~ shock_nonscore + shock_x_blow...  20178.233429   

            bic  
0  22551.577735  
1  21924.622665  
seasons [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021] to [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
