We initialize Python imports and opens a DuckDB connection that every later cell reuses

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning

try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

for p in [CWD] + list(CWD.parents):
    cand = p / "db" / "nflpa.duckdb"
    if cand.exists():
        REPO_ROOT = p
        DB_FILE = cand
        break

if DB_FILE is None:
    for p in [CWD] + list(CWD.parents):
        cand = p / "nflpa.duckdb"
        if cand.exists():
            REPO_ROOT = p
            DB_FILE = cand
            break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

con = duckdb.connect(str(DB_FILE), read_only=False)

BASE_TABLE = "step16_modeling_frame_nolookahead"

exists_df = con.execute(f"""
SELECT COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{BASE_TABLE}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()

if int(exists_df["n"].iloc[0]) == 0:
    raise RuntimeError(f"Missing {BASE_TABLE}, run notebook 16 first")

print("connected db", str(DB_FILE))
print("base table", BASE_TABLE)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
base table step16_modeling_frame_nolookahead


We build a modeling table that adds rolling volatility and prior season to date volatility and prior cumulative shock, and construct 'ScoreLinked' and 'All' diagnostics without changing the main 'NonScore'

In [2]:
df = con.execute(f"SELECT * FROM {BASE_TABLE}").df()

desc = con.execute(f"DESCRIBE {BASE_TABLE}").df()
cols = set(desc["column_name"].astype(str).tolist())

def pick_col(candidates: list[str], available: set[str], label: str) -> str:
    for c in candidates:
        if c in available:
            return c
    raise RuntimeError(f"Missing {label}, add its exact name into candidates, available columns include {sorted(list(available))[:40]}")

TEAM_COL = pick_col(["team", "team_key"], cols, "team id column")
SEASON_COL = pick_col(["season"], cols, "season column")
WEEK_COL = pick_col(["week"], cols, "week column")

if "season_week" not in df.columns:
    df["season_week"] = (df[SEASON_COL].astype(int) * 100 + df[WEEK_COL].astype(int)).astype(int)

if "load_nonscore" not in df.columns:
    load_candidates = ["ST_Load_NonScore_w", "ST_Load_NonScore", "ST_Load_NonScore_w"]
    load_col = pick_col(load_candidates, cols, "NonScore load column for volatility construction")
    df["load_nonscore"] = df[load_col].astype(float)

if "shock_nonscore" not in df.columns:
    shock_candidates = ["ST_Shock_NonScore_w", "shock_nonscore"]
    shock_col = pick_col(shock_candidates, cols, "NonScore shock column")
    df["shock_nonscore"] = df[shock_col].fillna(0).astype(int)

if "blowout_flag_w" not in df.columns:
    raise RuntimeError("Missing blowout_flag_w in the step16 frame, rerun notebook 10 and 11 then rebuild step16")

df = df.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)
g = df.groupby([TEAM_COL, SEASON_COL], sort=False)

df["cum_shocks_nonscore_prior"] = g["shock_nonscore"].cumsum().shift(1).fillna(0).astype(int)

df["vol_nonscore_s2d_prior"] = (
    g["load_nonscore"]
    .apply(lambda s: s.expanding().std(ddof=1).shift(1))
    .reset_index(level=[0, 1], drop=True)
    .fillna(0.0)
    .astype(float)
)

df["vol_nonscore_roll4_prior"] = (
    g["load_nonscore"]
    .apply(lambda s: s.shift(1).rolling(4, min_periods=2).std(ddof=1))
    .reset_index(level=[0, 1], drop=True)
    .fillna(df["vol_nonscore_s2d_prior"])
    .astype(float)
)

for k in [1, 2, 3]:
    col = f"ST_Shock_NonScore_w_minus_{k}"
    if col not in df.columns:
        df[col] = g["shock_nonscore"].shift(k).fillna(0).astype(int)

if "shock_x_blowout" not in df.columns:
    df["shock_x_blowout"] = (df["shock_nonscore"] * df["blowout_flag_w"].fillna(0).astype(int)).astype(int)

MODEL_VIEW = "team_week_panel_nextweek_model"
mv_exists = con.execute(f"""
SELECT COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{MODEL_VIEW}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()
if int(mv_exists["n"].iloc[0]) == 0:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to recreate it")

mv_desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
mv_cols = set(mv_desc["column_name"].astype(str).tolist())

scorelinked_col = None
all_col = None
if "ST_Load_ScoreLinked_w" in cols:
    scorelinked_col = "ST_Load_ScoreLinked_w"
if "ST_Load_All_w" in cols:
    all_col = "ST_Load_All_w"

need_join = (scorelinked_col is None) or (all_col is None)
if need_join:
    if "ST_Load_ScoreLinked_w" in mv_cols:
        scorelinked_col = "ST_Load_ScoreLinked_w"
    if "ST_Load_All_w" in mv_cols:
        all_col = "ST_Load_All_w"

if scorelinked_col is not None or all_col is not None:
    join_cols = [SEASON_COL, WEEK_COL, TEAM_COL]
    sel = [f"a.*"]
    if scorelinked_col is not None and scorelinked_col not in cols:
        sel.append(f"b.{scorelinked_col} AS {scorelinked_col}")
    if all_col is not None and all_col not in cols:
        sel.append(f"b.{all_col} AS {all_col}")

    if len(sel) > 1:
        df = con.execute(f"""
        SELECT {", ".join(sel)}
        FROM df a
        LEFT JOIN {MODEL_VIEW} b
        ON a.{SEASON_COL} = b.{SEASON_COL}
        AND a.{WEEK_COL} = b.{WEEK_COL}
        AND a.{TEAM_COL} = b.{TEAM_COL}
        """).df()

def add_diag_prefix(load_col: str, prefix: str):
    df[f"load_{prefix}"] = df[load_col].astype(float)

    mean_prior = (
        df.groupby([TEAM_COL, SEASON_COL], sort=False)[f"load_{prefix}"]
        .apply(lambda s: s.expanding().mean().shift(1))
        .reset_index(level=[0, 1], drop=True)
    )
    sd_prior = (
        df.groupby([TEAM_COL, SEASON_COL], sort=False)[f"load_{prefix}"]
        .apply(lambda s: s.expanding().std(ddof=1).shift(1))
        .reset_index(level=[0, 1], drop=True)
    )

    z_prior = (df[f"load_{prefix}"] - mean_prior) / sd_prior
    df[f"shock_{prefix}"] = (z_prior >= 1).fillna(False).astype(int)
    df[f"shock_x_blowout_{prefix}"] = (df[f"shock_{prefix}"] * df["blowout_flag_w"].fillna(0).astype(int)).astype(int)

    df[f"vol_{prefix}_s2d_prior"] = sd_prior.fillna(0.0).astype(float)
    df[f"vol_{prefix}_roll4_prior"] = (
        df.groupby([TEAM_COL, SEASON_COL], sort=False)[f"load_{prefix}"]
        .apply(lambda s: s.shift(1).rolling(4, min_periods=2).std(ddof=1))
        .reset_index(level=[0, 1], drop=True)
        .fillna(df[f"vol_{prefix}_s2d_prior"])
        .astype(float)
    )
    df[f"cum_shocks_{prefix}_prior"] = (
        df.groupby([TEAM_COL, SEASON_COL], sort=False)[f"shock_{prefix}"]
        .cumsum()
        .shift(1)
        .fillna(0)
        .astype(int)
    )

if scorelinked_col is not None and scorelinked_col in df.columns:
    add_diag_prefix(scorelinked_col, "scorelinked")

if all_col is not None and all_col in df.columns:
    add_diag_prefix(all_col, "all")

con.register("step18_model_frame_tmp", df)
con.execute("CREATE OR REPLACE TABLE step18_model_frame AS SELECT * FROM step18_model_frame_tmp")
con.unregister("step18_model_frame_tmp")

print("wrote duckdb table step18_model_frame")
print("rows", len(df))

wrote duckdb table step18_model_frame
rows 5950


Quick sanity check to confirm that every candidate spec will use a consistent non missing modeling sample and the key exposure variables have variation

In [3]:
df = con.execute("SELECT * FROM step18_model_frame").df()

required = [
    "shock_nonscore",
    "shock_x_blowout",
    "cum_shocks_nonscore_prior",
    "vol_nonscore_s2d_prior",
    "vol_nonscore_roll4_prior",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
    "blowout_flag_w",
]

missing = [c for c in required if c not in df.columns]
if missing:
    raise RuntimeError(f"Missing required step18 columns, {missing}")

nulls = df[required].isna().sum().sort_values(ascending=False)
print("null counts in required columns")
print(nulls)

print("shock_nonscore value counts")
print(df["shock_nonscore"].value_counts(dropna=False).sort_index())

print("vol_nonscore_s2d_prior min max")
print(float(df["vol_nonscore_s2d_prior"].min()), float(df["vol_nonscore_s2d_prior"].max()))

print("cum_shocks_nonscore_prior min max")
print(int(df["cum_shocks_nonscore_prior"].min()), int(df["cum_shocks_nonscore_prior"].max()))

null counts in required columns
shock_nonscore                 0
shock_x_blowout                0
cum_shocks_nonscore_prior      0
vol_nonscore_s2d_prior         0
vol_nonscore_roll4_prior       0
ST_Shock_NonScore_w_minus_1    0
ST_Shock_NonScore_w_minus_2    0
ST_Shock_NonScore_w_minus_3    0
blowout_flag_w                 0
dtype: int64
shock_nonscore value counts
shock_nonscore
0    4943
1    1007
Name: count, dtype: int64
vol_nonscore_s2d_prior min max
0.0 9.192388155425117
cum_shocks_nonscore_prior min max
0 7


We define consistent AIC and BIC calculation helpers to avoid common pitfalls, such as using deviance-based BIC or failing to account for the fact that robust clustering affects inference without altering the likelihood criteria

In [4]:
df = con.execute("SELECT * FROM step18_model_frame").df()

TEAM_COL = "team" if "team" in df.columns else "team_key"
SEASON_COL = "season"
WEEK_COL = "week"

OUTCOME_DEF = "Inj_Def_Next_w"
OUTCOME_OFF = "Inj_Off_Next_w"

for out in [OUTCOME_DEF, OUTCOME_OFF]:
    if out not in df.columns:
        raise RuntimeError(f"Missing outcome {out}, rerun notebook 9 through 11 then rebuild step16 and step18")

def aic_bic(llf: float, nobs: int, k_params: int) -> tuple[float, float]:
    if not np.isfinite(llf) or (nobs <= 0) or (k_params <= 0):
        return np.nan, np.nan
    aic_val = -2.0 * llf + 2.0 * k_params
    bic_val = -2.0 * llf + np.log(float(nobs)) * float(k_params)
    return float(aic_val), float(bic_val)

def extract_alpha(res) -> float:
    if res is None:
        return np.nan
    try:
        if hasattr(res, "params") and ("alpha" in res.params.index):
            return float(res.params.loc["alpha"])
    except Exception:
        pass
    try:
        fam = getattr(getattr(res, "model", None), "family", None)
        if fam is not None and hasattr(fam, "alpha"):
            return float(fam.alpha)
    except Exception:
        pass
    return np.nan

def fit_poisson(formula: str, data: pd.DataFrame):
    m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    r = m.fit(maxiter=200, disp=0)
    return r

def fit_nb_discrete(formula: str, data: pd.DataFrame):
    m = smf.negativebinomial(formula=formula, data=data)
    r = m.fit(disp=False, maxiter=200)
    return r

def robust_cluster(res, groups: pd.Series):
    try:
        return res.get_robustcov_results(cov_type="cluster", groups=groups)
    except Exception:
        return None

def fit_one_spec(spec_id: str, side: str, outcome: str, formula: str, family: str, data: pd.DataFrame, key_terms: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    if family not in ["poisson", "negative_binomial"]:
        raise RuntimeError("family must be poisson or negative_binomial")

    base_res = None
    err = None
    try:
        if family == "poisson":
            base_res = fit_poisson(formula, data)
        else:
            base_res = fit_nb_discrete(formula, data)
    except Exception as e:
        err = str(e)

    if base_res is None:
        meta = pd.DataFrame([{
            "spec_id": spec_id,
            "side": side,
            "outcome": outcome,
            "family": family,
            "formula": formula,
            "nobs": np.nan,
            "k_params": np.nan,
            "llf": np.nan,
            "aic": np.nan,
            "bic": np.nan,
            "alpha": np.nan,
            "fit_error": err,
        }])
        coefs = pd.DataFrame([], columns=[
            "spec_id", "side", "outcome", "family", "term", "beta", "se_cluster", "pvalue", "is_key_term"
        ])
        return meta, coefs

    nobs = int(getattr(base_res, "nobs", np.nan))
    params = base_res.params
    k_params = int(len(params))
    llf = float(getattr(base_res, "llf", np.nan))
    aic_val, bic_val = aic_bic(llf, nobs, k_params)
    alpha = extract_alpha(base_res)

    groups = data[TEAM_COL]
    rob = robust_cluster(base_res, groups)

    if rob is None:
        se = getattr(base_res, "bse", pd.Series(index=params.index, data=np.nan)).astype(float)
        pv = getattr(base_res, "pvalues", pd.Series(index=params.index, data=np.nan)).astype(float)
    else:
        se = rob.bse.astype(float)
        pv = rob.pvalues.astype(float)

    coef_df = pd.DataFrame({
        "spec_id": spec_id,
        "side": side,
        "outcome": outcome,
        "family": family,
        "term": params.index.astype(str),
        "beta": params.values.astype(float),
        "se_cluster": se.reindex(params.index).values.astype(float),
        "pvalue": pv.reindex(params.index).values.astype(float),
    })

    key_set = set(key_terms)
    coef_df["is_key_term"] = coef_df["term"].apply(lambda x: 1 if x in key_set else 0)

    meta = pd.DataFrame([{
        "spec_id": spec_id,
        "side": side,
        "outcome": outcome,
        "family": family,
        "formula": formula,
        "nobs": nobs,
        "k_params": k_params,
        "llf": llf,
        "aic": aic_val,
        "bic": bic_val,
        "alpha": alpha,
        "fit_error": None,
    }])

    return meta, coef_df

print("helpers ready")

helpers ready


We fit a focused grid of model variants, then we write AIC and BIC tables and a coefficients table into DuckDB and into outputs csv files

In [6]:
df = con.execute("SELECT * FROM step18_model_frame").df()
TEAM_COL = "team" if "team" in df.columns else "team_key"

FE_TEAM = f"C({TEAM_COL})"
FE_TIME = "C(season_week)"

control_common = [
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "blowout_flag_w",
    "points_for",
    "points_against",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
    "Cumulative_Workload_Index_w",
]

control_common = [c for c in control_common if c in df.columns]

if "points_for" not in df.columns or "points_against" not in df.columns:
    raise RuntimeError("Missing points_for or points_against, rerun notebook 10 and rebuild step11 onward")

base_lags = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
base_lags = [c for c in base_lags if c in df.columns]

specs = []

specs.append({
    "spec_id": "nonscore_s2d_with_lags",
    "family": None,
    "side": "both",
    "exposure_terms": [
        "shock_nonscore",
        "shock_x_blowout",
        "vol_nonscore_s2d_prior",
        "cum_shocks_nonscore_prior",
    ] + base_lags,
})

specs.append({
    "spec_id": "nonscore_s2d_no_lags",
    "family": None,
    "side": "both",
    "exposure_terms": [
        "shock_nonscore",
        "shock_x_blowout",
        "vol_nonscore_s2d_prior",
        "cum_shocks_nonscore_prior",
    ],
})

specs.append({
    "spec_id": "nonscore_roll4_with_lags",
    "family": None,
    "side": "both",
    "exposure_terms": [
        "shock_nonscore",
        "shock_x_blowout",
        "vol_nonscore_roll4_prior",
        "cum_shocks_nonscore_prior",
    ] + base_lags,
})

specs.append({
    "spec_id": "nonscore_roll4_no_lags",
    "family": None,
    "side": "both",
    "exposure_terms": [
        "shock_nonscore",
        "shock_x_blowout",
        "vol_nonscore_roll4_prior",
        "cum_shocks_nonscore_prior",
    ],
})

specs.append({
    "spec_id": "nonscore_s2d_lags_no_vol",
    "family": None,
    "side": "both",
    "exposure_terms": [
        "shock_nonscore",
        "shock_x_blowout",
        "cum_shocks_nonscore_prior",
    ] + base_lags,
})

if "shock_scorelinked" in df.columns:
    diag_lags = []
    specs.append({
        "spec_id": "scorelinked_s2d_diag_no_lags",
        "family": None,
        "side": "both",
        "exposure_terms": [
            "shock_scorelinked",
            "shock_x_blowout_scorelinked",
            "vol_scorelinked_s2d_prior",
            "cum_shocks_scorelinked_prior",
        ] + diag_lags,
    })

if "shock_all" in df.columns:
    diag_lags = []
    specs.append({
        "spec_id": "all_s2d_diag_no_lags",
        "family": None,
        "side": "both",
        "exposure_terms": [
            "shock_all",
            "shock_x_blowout_all",
            "vol_all_s2d_prior",
            "cum_shocks_all_prior",
        ] + diag_lags,
    })

def choose_family_from_step16(outcome: str) -> str:
    stats_cols = set(con.execute("SHOW TABLES").df()["name"].astype(str).tolist())

    if "step16_overdispersion_diagnostics" in stats_cols:
        d = con.execute("SELECT * FROM step16_overdispersion_diagnostics").df()
        cols = list(d.columns)
        low = {c.lower(): c for c in cols}

        outcome_col_candidates = ["outcome", "y", "depvar", "dependent", "response", "dv"]
        fam_col_candidates = ["chosen_family", "family", "selected_family", "best_family", "model_family"]

        out_col = next((low[c] for c in outcome_col_candidates if c in low), None)
        fam_col = next((low[c] for c in fam_col_candidates if c in low), None)

        if out_col is None:
            for c in cols:
                try:
                    if (d[c].astype(str) == str(outcome)).any():
                        out_col = c
                        break
                except Exception:
                    continue

        if fam_col is None:
            for c in cols:
                try:
                    vals = d[c].astype(str).str.lower().unique().tolist()
                    joined = " ".join(vals)
                    if ("poisson" in joined) or ("negative" in joined) or ("nb" in joined):
                        fam_col = c
                        break
                except Exception:
                    continue

        if (out_col is not None) and (fam_col is not None):
            row = d[d[out_col].astype(str) == str(outcome)]
            if len(row) == 1:
                fam_raw = str(row[fam_col].iloc[0]).lower()
                if ("negative" in fam_raw) or (fam_raw in ["nb", "neg_bin", "negativebinomial", "negative_binomial"]):
                    return "negative_binomial"
                if "poisson" in fam_raw:
                    return "poisson"

    y = df[outcome].astype(float)
    m = float(y.mean())
    v = float(y.var(ddof=1))
    if np.isfinite(m) and m > 0 and np.isfinite(v) and (v / m) >= 1.5:
        return "negative_binomial"
    return "poisson"

fam_def = choose_family_from_step16(OUTCOME_DEF)
fam_off = choose_family_from_step16(OUTCOME_OFF)

def build_formula(outcome: str, exposure_terms: list[str]) -> str:
    rhs = exposure_terms + control_common + [FE_TEAM, FE_TIME]
    rhs = [t for t in rhs if t in df.columns or t.startswith("C(")]
    return outcome + " ~ " + " + ".join(rhs)

meta_rows = []
coef_rows = []

for sp in specs:
    spec_id = sp["spec_id"]
    exposure_terms = sp["exposure_terms"]

    f_def = build_formula(OUTCOME_DEF, exposure_terms)
    f_off = build_formula(OUTCOME_OFF, exposure_terms)

    key_terms_def = exposure_terms
    key_terms_off = exposure_terms

    m_def, c_def = fit_one_spec(spec_id, "def", OUTCOME_DEF, f_def, fam_def, df, key_terms_def)
    m_off, c_off = fit_one_spec(spec_id, "off", OUTCOME_OFF, f_off, fam_off, df, key_terms_off)

    meta_rows.append(m_def)
    meta_rows.append(m_off)
    coef_rows.append(c_def)
    coef_rows.append(c_off)

meta_df = pd.concat(meta_rows, ignore_index=True)
coef_df = pd.concat(coef_rows, ignore_index=True)

meta_df = meta_df.sort_values(["outcome", "side", "bic"], na_position="last").reset_index(drop=True)

print("top specs by BIC for defense")
print(meta_df[(meta_df["side"] == "def")].head(10)[["spec_id", "family", "nobs", "aic", "bic", "fit_error"]])

print("top specs by BIC for offense")
print(meta_df[(meta_df["side"] == "off")].head(10)[["spec_id", "family", "nobs", "aic", "bic", "fit_error"]])

con.register("step18_meta_tmp", meta_df)
con.execute("CREATE OR REPLACE TABLE step18_model_selection_aic_bic AS SELECT * FROM step18_meta_tmp")
con.unregister("step18_meta_tmp")

con.register("step18_coef_tmp", coef_df)
con.execute("CREATE OR REPLACE TABLE step18_model_selection_coefficients AS SELECT * FROM step18_coef_tmp")
con.unregister("step18_coef_tmp")

out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

meta_csv = out_dir / "step18_model_selection_aic_bic.csv"
coef_csv = out_dir / "step18_model_selection_coefficients.csv"

meta_df.to_csv(meta_csv, index=False)
coef_df.to_csv(coef_csv, index=False)

print("wrote duckdb table step18_model_selection_aic_bic")
print("wrote duckdb table step18_model_selection_coefficients")
print("wrote csv", meta_csv.resolve())
print("wrote csv", coef_csv.resolve())

top specs by BIC for defense
                        spec_id   family  nobs           aic           bic  \
0        nonscore_roll4_no_lags  poisson  5950  20805.188499  22551.577735   
1          nonscore_s2d_no_lags  poisson  5950  20806.041592  22552.430828   
2  scorelinked_s2d_diag_no_lags  poisson  5950  20806.194923  22552.584159   
3          all_s2d_diag_no_lags  poisson  5950  20808.143250  22554.532486   
4      nonscore_s2d_lags_no_vol  poisson  5950  20807.485216  22567.256745   
5      nonscore_roll4_with_lags  poisson  5950  20808.400090  22574.862766   
6        nonscore_s2d_with_lags  poisson  5950  20808.983821  22575.446496   

  fit_error  
0      None  
1      None  
2      None  
3      None  
4      None  
5      None  
6      None  
top specs by BIC for offense
                         spec_id   family  nobs           aic           bic  \
7         nonscore_roll4_no_lags  poisson  5950  20178.233429  21924.622665   
8   scorelinked_s2d_diag_no_lags  poisson  5950

Quick sanity check to confirm that the preferred spec selection excludes diagnostics and consistently picks the lowest BIC NonScore model for each side

In [7]:
meta_df = con.execute("SELECT * FROM step18_model_selection_aic_bic").df()

col_map = {c.lower(): c for c in meta_df.columns}
OUTCOME_COL = col_map.get("outcome", None)
if OUTCOME_COL is None:
    OUTCOME_COL = col_map.get("y", None)

if OUTCOME_COL is None:
    OUTCOME_COL = "outcome"
    meta_df[OUTCOME_COL] = meta_df["formula"].astype(str).str.split("~").str[0].str.strip()

def pick_preferred(side: str) -> pd.Series:
    m = meta_df[(meta_df["side"] == side) & (meta_df["fit_error"].isna())].copy()
    m["is_diag"] = m["spec_id"].astype(str).str.contains("scorelinked|all", case=False, regex=True)
    m = m[m["is_diag"] == False].copy()
    m = m.sort_values(["bic", "aic", "spec_id"], na_position="last").reset_index(drop=True)
    if len(m) == 0:
        raise RuntimeError(f"No successful NonScore specs for side {side}")
    return m.iloc[0]

pref_def = pick_preferred("def")
pref_off = pick_preferred("off")

preferred = pd.DataFrame([{
    "side": "def",
    "outcome": str(pref_def[OUTCOME_COL]),
    "family": str(pref_def["family"]),
    "spec_id": str(pref_def["spec_id"]),
    "formula": str(pref_def["formula"]),
    "aic": float(pref_def["aic"]),
    "bic": float(pref_def["bic"]),
}, {
    "side": "off",
    "outcome": str(pref_off[OUTCOME_COL]),
    "family": str(pref_off["family"]),
    "spec_id": str(pref_off["spec_id"]),
    "formula": str(pref_off["formula"]),
    "aic": float(pref_off["aic"]),
    "bic": float(pref_off["bic"]),
}])

print(preferred)

if (preferred["spec_id"] != "nonscore_roll4_no_lags").any():
    print("note preferred spec is not nonscore_roll4_no_lags for at least one side, verify step18_model_selection_aic_bic ordering")

con.register("step18_pref_tmp", preferred)
con.execute("CREATE OR REPLACE TABLE step18_preferred_model_specs AS SELECT * FROM step18_pref_tmp")
con.unregister("step18_pref_tmp")

out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

pref_csv = out_dir / "step18_preferred_model_specs.csv"
preferred.to_csv(pref_csv, index=False)

print("wrote duckdb table step18_preferred_model_specs")
print("wrote csv", pref_csv.resolve())

  side         outcome   family                 spec_id  \
0  def  Inj_Def_Next_w  poisson  nonscore_roll4_no_lags   
1  off  Inj_Off_Next_w  poisson  nonscore_roll4_no_lags   

                                             formula           aic  \
0  Inj_Def_Next_w ~ shock_nonscore + shock_x_blow...  20805.188499   
1  Inj_Off_Next_w ~ shock_nonscore + shock_x_blow...  20178.233429   

            bic  
0  22551.577735  
1  21924.622665  
wrote duckdb table step18_preferred_model_specs
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step18_preferred_model_specs.csv
