We initialize Python imports and opens a DuckDB connection that every later cell reuses

In [7]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning

try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

for p in [CWD] + list(CWD.parents):
    cand = p / "db" / "nflpa.duckdb"
    if cand.exists():
        REPO_ROOT = p
        DB_FILE = cand
        break

if DB_FILE is None:
    for p in [CWD] + list(CWD.parents):
        cand = p / "nflpa.duckdb"
        if cand.exists():
            REPO_ROOT = p
            DB_FILE = cand
            break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb or nflpa.duckdb near this notebook")

con = duckdb.connect(str(DB_FILE), read_only=False)

MODEL_VIEW = "team_week_panel_nextweek_model"

exists_df = con.execute(f"""
SELECT
  COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{MODEL_VIEW}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()

if int(exists_df["n"].iloc[0]) == 0:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to create the model view")

print("connected db", str(DB_FILE))
print("model view", MODEL_VIEW)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
model view team_week_panel_nextweek_model


Quick sanity check to confirm that the modeling view only contains next week eligible rows, has non null offensive and defensive outcomes, and has unique team week keys

In [3]:
con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week != 1 THEN 1 ELSE 0 END) AS bad_has_next_week,
  SUM(CASE WHEN Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_def,
  SUM(CASE WHEN Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_off
FROM {MODEL_VIEW}
""").df()

desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = set(desc["column_name"].astype(str).tolist())

if "team" in cols:
    TEAM_COL = "team"
elif "team_key" in cols:
    TEAM_COL = "team_key"
else:
    raise RuntimeError(f"No team id column found in {MODEL_VIEW}, expected team or team_key")

con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_any,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
)
""").df()

Unnamed: 0,dup_rows
0,0


We build a single, clean modeling frame table with robust column detection, consistent names, and consistent missing value handling

In [4]:
def pick_first_present(candidates: list[str], present: set[str]) -> str | None:
    for c in candidates:
        if c in present:
            return c
    return None

desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
present_cols = set(desc["column_name"].astype(str).tolist())

SEASON_COL = pick_first_present(["season"], present_cols)
WEEK_COL = pick_first_present(["week"], present_cols)
TEAM_RAW_COL = pick_first_present(["team", "team_key"], present_cols)

if SEASON_COL is None or WEEK_COL is None:
    raise RuntimeError("Missing season or week columns in model view")
if TEAM_RAW_COL is None:
    raise RuntimeError("Missing team column, expected team or team_key in model view")

OUTCOME_DEF = "Inj_Def_Next_w"
OUTCOME_OFF = "Inj_Off_Next_w"
if OUTCOME_DEF not in present_cols or OUTCOME_OFF not in present_cols:
    raise RuntimeError("Missing Inj_Def_Next_w or Inj_Off_Next_w in model view")

LAG_CANDIDATES = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
LAG_COLS = [c for c in LAG_CANDIDATES if c in present_cols]

SHOCK_COL_MAIN = pick_first_present(
    ["ST_Shock_NonScore_w", "st_shock_nonscore_w", "shock_nonscore"],
    present_cols
)

POINTS_FOR_COL = pick_first_present(["points_for_w", "points_for"], present_cols)
POINTS_AGAINST_COL = pick_first_present(["points_against_w", "points_against"], present_cols)
SCORE_DIFF_COL = pick_first_present(["score_diff_w", "score_diff"], present_cols)
OFF_YPP_COL = pick_first_present(["off_yards_per_play_w", "Off_yards_per_play_w"], present_cols)
CWI_COL = pick_first_present(["Cumulative_Workload_Index_w", "cumulative_workload_index_w"], present_cols)

required_core = [
    SHOCK_COL_MAIN,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]
if any(c is None for c in required_core):
    raise RuntimeError("Missing one or more required columns for steps 16 and 17, check your step 10 and step 11 outputs")

select_cols = [
    SEASON_COL,
    WEEK_COL,
    TEAM_RAW_COL,
    OUTCOME_DEF,
    OUTCOME_OFF,
    "Inj_Def_Last_w",
    "Inj_Off_Last_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
] + LAG_COLS + [
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]

df = con.execute(f"SELECT {', '.join(select_cols)} FROM {MODEL_VIEW}").df()

rename_map = {}

if TEAM_RAW_COL != "team":
    rename_map[TEAM_RAW_COL] = "team"
if POINTS_FOR_COL != "points_for":
    rename_map[POINTS_FOR_COL] = "points_for"
if POINTS_AGAINST_COL != "points_against":
    rename_map[POINTS_AGAINST_COL] = "points_against"
if SCORE_DIFF_COL != "score_diff_w":
    rename_map[SCORE_DIFF_COL] = "score_diff_w"
if OFF_YPP_COL != "off_yards_per_play_w":
    rename_map[OFF_YPP_COL] = "off_yards_per_play_w"
if CWI_COL != "Cumulative_Workload_Index_w":
    rename_map[CWI_COL] = "Cumulative_Workload_Index_w"
if SHOCK_COL_MAIN != "ST_Shock_NonScore_w":
    rename_map[SHOCK_COL_MAIN] = "ST_Shock_NonScore_w"

df = df.rename(columns=rename_map)

df["team"] = df["team"].astype(str)
df[SEASON_COL] = df[SEASON_COL].astype(int)
df[WEEK_COL] = df[WEEK_COL].astype(int)

df["season_week"] = (df[SEASON_COL] * 100 + df[WEEK_COL]).astype(int)

df["blowout_flag_w"] = df["blowout_flag_w"].fillna(0).astype(int)
df["short_week_flag_w"] = df["short_week_flag_w"].fillna(0).astype(int)
df["bye_last_week_flag_w"] = df["bye_last_week_flag_w"].fillna(0).astype(int)
df["home_flag_w"] = df["home_flag_w"].fillna(0).astype(int)

df["Inj_Def_Last_w"] = df["Inj_Def_Last_w"].fillna(0).astype(float)
df["Inj_Off_Last_w"] = df["Inj_Off_Last_w"].fillna(0).astype(float)

df["ST_Shock_NonScore_w"] = df["ST_Shock_NonScore_w"].fillna(0).astype(int)
df["shock_nonscore"] = df["ST_Shock_NonScore_w"].astype(int)
df["shock_x_blowout"] = (df["shock_nonscore"] * df["blowout_flag_w"]).astype(int)

for c in LAG_COLS:
    df[c] = df[c].fillna(0).astype(int)

must_not_be_null = [
    OUTCOME_DEF,
    OUTCOME_OFF,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "points_for",
    "points_against",
    "score_diff_w",
    "off_yards_per_play_w",
    "Cumulative_Workload_Index_w",
]

before = len(df)
df = df.dropna(subset=must_not_be_null).reset_index(drop=True)
after = len(df)

print("rows before dropna", before)
print("rows after dropna", after)

con.register("step16_modeling_frame_tmp", df)
con.execute("CREATE OR REPLACE TABLE step16_modeling_frame AS SELECT * FROM step16_modeling_frame_tmp")
con.unregister("step16_modeling_frame_tmp")

print("wrote duckdb table step16_modeling_frame")
df.head(3)

rows before dropna 5950
rows after dropna 5950
wrote duckdb table step16_modeling_frame


Unnamed: 0,season,week,team,Inj_Def_Next_w,Inj_Off_Next_w,Inj_Def_Last_w,Inj_Off_Last_w,blowout_flag_w,short_week_flag_w,bye_last_week_flag_w,...,ST_Shock_NonScore_w_minus_2,ST_Shock_NonScore_w_minus_3,points_for,points_against,score_diff_w,off_yards_per_play_w,Cumulative_Workload_Index_w,season_week,shock_nonscore,shock_x_blowout
0,2012,1,ATL,2.0,2.0,0.0,0.0,1,0,0,...,0,0,40,24,16,6.836364,-3.940011,201201,0,0
1,2012,2,ATL,3.0,2.0,2.0,2.0,0,0,0,...,0,0,27,21,6,4.412698,-3.638251,201202,0,0
2,2012,3,ATL,2.0,2.0,3.0,2.0,1,0,0,...,0,0,27,3,24,5.565217,-2.938346,201203,0,0


We compute the first pass mean and variance checks for both outcomes so that we can see unconditional overdispersion before running model based tests

In [5]:
def outcome_dispersion_stats(y: pd.Series) -> dict:
    y = y.astype(float)
    mean = float(y.mean())
    var = float(y.var(ddof=1))
    share_zero = float((y == 0).mean())
    return {
        "mean": mean,
        "var": var,
        "var_over_mean": (var / mean) if mean > 0 else np.nan,
        "share_zero": share_zero,
        "max": float(y.max()),
    }

stats_def = outcome_dispersion_stats(df[OUTCOME_DEF])
stats_off = outcome_dispersion_stats(df[OUTCOME_OFF])

print("defense outcome", OUTCOME_DEF)
for k in ["mean", "var", "var_over_mean", "share_zero", "max"]:
    print(k, stats_def[k])

print()
print("offense outcome", OUTCOME_OFF)
for k in ["mean", "var", "var_over_mean", "share_zero", "max"]:
    print(k, stats_off[k])

defense outcome Inj_Def_Next_w
mean 2.083865546218487
var 2.380761656150105
var_over_mean 1.1424737361152615
share_zero 0.15529411764705883
max 10.0

offense outcome Inj_Off_Next_w
mean 1.9201680672268908
var 2.1612169830110557
var_over_mean 1.125535321568121
share_zero 0.17714285714285713
max 9.0


We fit Poisson and Negative Binomial versions of Models A and B using the same predictor blocks and fixed effects structure and store the preferred script spec that matches the existing selection rule

In [8]:
FE_TEAM = "C(team)"
FE_TIME = "C(season_week)"
cluster_groups = df["team"]

OUTCOME_DEF_USED = OUTCOME_DEF
OUTCOME_OFF_USED = OUTCOME_OFF

exposure_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
] + LAG_COLS

control_terms_base_def = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "off_yards_per_play_w",
    "Inj_Def_Last_w",
    "Cumulative_Workload_Index_w",
]

control_terms_base_off = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "off_yards_per_play_w",
    "Inj_Off_Last_w",
    "Cumulative_Workload_Index_w",
]

script_specs = [
    ("points_for_diff", ["points_for", "score_diff_w"]),
    ("points_against_diff", ["points_against", "score_diff_w"]),
    ("points_for_against", ["points_for", "points_against"]),
]

preferred_order = ["points_for_diff", "points_against_diff", "points_for_against"]

def build_formula(outcome: str, base_controls: list[str], script_terms: list[str]) -> str:
    rhs = exposure_terms + base_controls + script_terms + [FE_TEAM, FE_TIME]
    return outcome + " ~ " + " + ".join(rhs)

def fit_count_model_poisson_glm(formula: str, data: pd.DataFrame, groups: pd.Series, maxiter: int = 200):
    m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    r = m.fit(maxiter=maxiter, cov_type="cluster", cov_kwds={"groups": groups})
    return r

def estimate_nb2_alpha_moments(y: np.ndarray, mu: np.ndarray) -> float:
    y = y.astype(float)
    mu = mu.astype(float)
    den = float(np.sum(mu ** 2))
    if den <= 0:
        return 1e-8
    num = float(np.sum((y - mu) ** 2 - mu))
    alpha = num / den
    if not np.isfinite(alpha):
        alpha = 1e-8
    alpha = float(max(alpha, 1e-8))
    alpha = float(min(alpha, 50.0))
    return alpha

def fit_count_model_negative_binomial_glm_nb2(formula: str, data: pd.DataFrame, groups: pd.Series, alpha: float, maxiter: int = 200):
    fam = sm.families.NegativeBinomial(alpha=alpha)
    m = smf.glm(formula=formula, data=data, family=fam)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=HessianInversionWarning)
        r = m.fit(maxiter=maxiter, cov_type="cluster", cov_kwds={"groups": groups})

    return r

def fit_model_grid(outcome: str, base_controls: list[str]) -> tuple[str, str, object, object]:
    fits = []
    for tag, script_terms in script_specs:
        f = build_formula(outcome, base_controls, script_terms)

        pois = None
        nb = None

        try:
            pois = fit_count_model_poisson_glm(f, df, cluster_groups)
        except Exception as e:
            print("poisson failed", outcome, tag, str(e))
            continue

        try:
            y = df[outcome].to_numpy()
            mu = np.asarray(pois.mu)
            alpha_hat = estimate_nb2_alpha_moments(y, mu)

            nb = fit_count_model_negative_binomial_glm_nb2(
                f,
                df,
                cluster_groups,
                alpha=alpha_hat,
                maxiter=200,
            )

            nb._alpha_hat_mom = alpha_hat
        except Exception as e:
            print("negative binomial glm failed", outcome, tag, str(e))

        fits.append((tag, f, pois, nb))
        print("fit ok", outcome, tag)

    if len(fits) == 0:
        raise RuntimeError(f"No specifications fit successfully for {outcome}")

    fits_sorted = sorted(
        fits,
        key=lambda x: preferred_order.index(x[0]) if x[0] in preferred_order else 999
    )
    return fits_sorted[0]

spec_tag_def, formula_def_used, pois_def, nb_def = fit_model_grid(OUTCOME_DEF_USED, control_terms_base_def)
spec_tag_off, formula_off_used, pois_off, nb_off = fit_model_grid(OUTCOME_OFF_USED, control_terms_base_off)

print()
print("selected Model A spec", spec_tag_def)
print(formula_def_used)
print()
print("selected Model B spec", spec_tag_off)
print(formula_off_used)

if nb_def is not None:
    print()
    print("defense nb alpha hat", float(getattr(nb_def, "_alpha_hat_mom", np.nan)))

if nb_off is not None:
    print()
    print("offense nb alpha hat", float(getattr(nb_off, "_alpha_hat_mom", np.nan)))

fit ok Inj_Def_Next_w points_for_diff
fit ok Inj_Def_Next_w points_against_diff
fit ok Inj_Def_Next_w points_for_against
fit ok Inj_Off_Next_w points_for_diff
fit ok Inj_Off_Next_w points_against_diff
fit ok Inj_Off_Next_w points_for_against

selected Model A spec points_for_diff
Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + offensive_snaps_w + defensive_snaps_w + blowout_flag_w + short_week_flag_w + bye_last_week_flag_w + home_flag_w + off_yards_per_play_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + points_for + score_diff_w + C(team) + C(season_week)

selected Model B spec points_for_diff
Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + offensive_snaps_w + defensive_snaps_w + blowout_flag_w + short_week_fla

We run model based overdispersion diagnostics for each Poisson fit using Pearson dispersion and an approximate z test so the decision is not based only on unconditional variance

In [9]:
def poisson_overdispersion_diagnostics(res) -> dict:
    chi2 = float(res.pearson_chi2)
    df_resid = float(res.df_resid)
    ratio = chi2 / df_resid if df_resid > 0 else np.nan

    z = (chi2 - df_resid) / np.sqrt(2.0 * df_resid) if df_resid > 0 else np.nan
    p = float(2.0 * (1.0 - stats.norm.cdf(abs(z)))) if np.isfinite(z) else np.nan

    dev = float(res.deviance)
    dev_ratio = dev / df_resid if df_resid > 0 else np.nan

    return {
        "pearson_chi2": chi2,
        "df_resid": df_resid,
        "pearson_dispersion": ratio,
        "pearson_z": float(z) if np.isfinite(z) else np.nan,
        "pearson_pvalue": p,
        "deviance": dev,
        "deviance_dispersion": dev_ratio,
        "aic": float(getattr(res, "aic", np.nan)),
        "bic": float(getattr(res, "bic", np.nan)),
        "llf": float(getattr(res, "llf", np.nan)),
        "nobs": int(getattr(res, "nobs", np.nan)),
    }

diag_def = poisson_overdispersion_diagnostics(pois_def)
diag_off = poisson_overdispersion_diagnostics(pois_off)

print("poisson overdispersion diagnostics, defense")
for k in ["pearson_dispersion", "pearson_z", "pearson_pvalue", "deviance_dispersion", "aic", "bic", "llf", "nobs"]:
    print(k, diag_def[k])

print()
print("poisson overdispersion diagnostics, offense")
for k in ["pearson_dispersion", "pearson_z", "pearson_pvalue", "deviance_dispersion", "aic", "bic", "llf", "nobs"]:
    print(k, diag_off[k])

poisson overdispersion diagnostics, defense
pearson_dispersion 1.0699401873476033
pearson_z 3.729196098233154
pearson_pvalue 0.00019209162261613066
deviance_dispersion 1.205507707915455
aic 20850.174345191903
bic 22616.637020806378
llf -10161.087172595951
nobs 5950

poisson overdispersion diagnostics, offense
pearson_dispersion 1.045766743017786
pearson_z 2.440273124270023
pearson_pvalue 0.014676161275462452
deviance_dispersion 1.1981689973697445
aic 20219.032919018013
bic 21985.495594632488
llf -9845.516459509006
nobs 5950


We formalize the family choice rule and export a diagnostics table plus the selected family results for Models A and B to DuckDB and csv so that the later steps always reference a single preferred count model per side

In [10]:
def extract_nb_alpha(res) -> float:
    if res is None:
        return np.nan
    try:
        if "alpha" in res.params.index:
            return float(res.params["alpha"])
    except Exception:
        pass
    return np.nan

alpha_def = extract_nb_alpha(nb_def)
alpha_off = extract_nb_alpha(nb_off)

nb_meta_def = {
    "aic": float(getattr(nb_def, "aic", np.nan)) if nb_def is not None else np.nan,
    "bic": float(getattr(nb_def, "bic", np.nan)) if nb_def is not None else np.nan,
    "llf": float(getattr(nb_def, "llf", np.nan)) if nb_def is not None else np.nan,
    "nobs": int(getattr(nb_def, "nobs", np.nan)) if nb_def is not None else np.nan,
    "alpha": alpha_def,
}
nb_meta_off = {
    "aic": float(getattr(nb_off, "aic", np.nan)) if nb_off is not None else np.nan,
    "bic": float(getattr(nb_off, "bic", np.nan)) if nb_off is not None else np.nan,
    "llf": float(getattr(nb_off, "llf", np.nan)) if nb_off is not None else np.nan,
    "nobs": int(getattr(nb_off, "nobs", np.nan)) if nb_off is not None else np.nan,
    "alpha": alpha_off,
}

def choose_family(uncond_stats: dict, pois_diag: dict, nb_res) -> tuple[str, str]:
    var_over_mean = float(uncond_stats.get("var_over_mean", np.nan))
    pearson_disp = float(pois_diag.get("pearson_dispersion", np.nan))
    pval = float(pois_diag.get("pearson_pvalue", np.nan))

    overdisp_uncond = np.isfinite(var_over_mean) and (var_over_mean >= 1.5)
    overdisp_model = np.isfinite(pearson_disp) and np.isfinite(pval) and (pearson_disp >= 1.2) and (pval < 0.05)

    if (overdisp_uncond or overdisp_model) and (nb_res is not None):
        return "negative_binomial", "overdispersion flagged by unconditional or model based test"
    return "poisson", "no strong overdispersion signal, or NB failed"

choice_def, reason_def = choose_family(stats_def, diag_def, nb_def)
choice_off, reason_off = choose_family(stats_off, diag_off, nb_off)

print("family choice defense", choice_def)
print("reason defense", reason_def)
print()
print("family choice offense", choice_off)
print("reason offense", reason_off)

def tidy_count_res(res, model_name: str, outcome_name: str, spec_tag: str, key_terms: list[str]) -> pd.DataFrame:
    params = res.params.copy()
    bse = res.bse.copy()
    pvals = res.pvalues.copy()

    out = pd.DataFrame({
        "model": model_name,
        "spec_tag": spec_tag,
        "outcome": outcome_name,
        "term": params.index.astype(str),
        "beta": params.values.astype(float),
        "se_cluster": bse.values.astype(float),
        "pvalue": pvals.values.astype(float),
    })

    out["nobs"] = int(getattr(res, "nobs", np.nan))
    out["aic"] = float(getattr(res, "aic", np.nan))
    out["bic"] = float(getattr(res, "bic", np.nan))
    out["llf"] = float(getattr(res, "llf", np.nan))

    out["irr"] = np.exp(out["beta"].astype(float))
    out["irr_ci_lo"] = np.exp(out["beta"].astype(float) - 1.96 * out["se_cluster"].astype(float))
    out["irr_ci_hi"] = np.exp(out["beta"].astype(float) + 1.96 * out["se_cluster"].astype(float))

    key_keep = set(key_terms)
    out["is_key_term"] = out["term"].apply(lambda x: 1 if x in key_keep else 0)
    return out

key_terms = exposure_terms

selected_def_res = nb_def if choice_def == "negative_binomial" else pois_def
selected_off_res = nb_off if choice_off == "negative_binomial" else pois_off

selected_def_name = f"{choice_def}_modelA_selected"
selected_off_name = f"{choice_off}_modelB_selected"

selected_def_df = tidy_count_res(selected_def_res, selected_def_name, OUTCOME_DEF_USED, spec_tag_def, key_terms)
selected_off_df = tidy_count_res(selected_off_res, selected_off_name, OUTCOME_OFF_USED, spec_tag_off, key_terms)

final_selected = pd.concat([selected_def_df, selected_off_df], ignore_index=True)

diag_rows = []
diag_rows.append({
    "side": "defense",
    "spec_tag": spec_tag_def,
    "formula_used": formula_def_used,
    "outcome_mean": stats_def["mean"],
    "outcome_var": stats_def["var"],
    "outcome_var_over_mean": stats_def["var_over_mean"],
    "poisson_pearson_dispersion": diag_def["pearson_dispersion"],
    "poisson_pearson_pvalue": diag_def["pearson_pvalue"],
    "poisson_deviance_dispersion": diag_def["deviance_dispersion"],
    "poisson_aic": diag_def["aic"],
    "poisson_bic": diag_def["bic"],
    "nb_aic": nb_meta_def["aic"],
    "nb_bic": nb_meta_def["bic"],
    "nb_alpha": nb_meta_def["alpha"],
    "family_choice": choice_def,
    "choice_reason": reason_def,
})
diag_rows.append({
    "side": "offense",
    "spec_tag": spec_tag_off,
    "formula_used": formula_off_used,
    "outcome_mean": stats_off["mean"],
    "outcome_var": stats_off["var"],
    "outcome_var_over_mean": stats_off["var_over_mean"],
    "poisson_pearson_dispersion": diag_off["pearson_dispersion"],
    "poisson_pearson_pvalue": diag_off["pearson_pvalue"],
    "poisson_deviance_dispersion": diag_off["deviance_dispersion"],
    "poisson_aic": diag_off["aic"],
    "poisson_bic": diag_off["bic"],
    "nb_aic": nb_meta_off["aic"],
    "nb_bic": nb_meta_off["bic"],
    "nb_alpha": nb_meta_off["alpha"],
    "family_choice": choice_off,
    "choice_reason": reason_off,
})

diag_df = pd.DataFrame(diag_rows)

con.register("step16_overdisp_tmp", diag_df)
con.execute("CREATE OR REPLACE TABLE step16_overdispersion_diagnostics AS SELECT * FROM step16_overdisp_tmp")
con.unregister("step16_overdisp_tmp")

con.register("step16_selected_tmp", final_selected)
con.execute("CREATE OR REPLACE TABLE step16_selected_count_results AS SELECT * FROM step16_selected_tmp")
con.unregister("step16_selected_tmp")

out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

diag_csv = out_dir / "step16_overdispersion_diagnostics.csv"
sel_csv = out_dir / "step16_selected_count_results.csv"

diag_df.to_csv(diag_csv, index=False)
final_selected.to_csv(sel_csv, index=False)

print("wrote duckdb table step16_overdispersion_diagnostics")
print("wrote duckdb table step16_selected_count_results")
print("wrote csv", diag_csv.resolve())
print("wrote csv", sel_csv.resolve())

diag_df

family choice defense poisson
reason defense no strong overdispersion signal, or NB failed

family choice offense poisson
reason offense no strong overdispersion signal, or NB failed
wrote duckdb table step16_overdispersion_diagnostics
wrote duckdb table step16_selected_count_results
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step16_overdispersion_diagnostics.csv
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step16_selected_count_results.csv


Unnamed: 0,side,spec_tag,formula_used,outcome_mean,outcome_var,outcome_var_over_mean,poisson_pearson_dispersion,poisson_pearson_pvalue,poisson_deviance_dispersion,poisson_aic,poisson_bic,nb_aic,nb_bic,nb_alpha,family_choice,choice_reason
0,defense,points_for_diff,Inj_Def_Next_w ~ shock_nonscore + shock_x_blow...,2.083866,2.380762,1.142474,1.06994,0.000192,1.205508,20850.174345,22616.637021,20849.514123,22615.976799,,poisson,"no strong overdispersion signal, or NB failed"
1,offense,points_for_diff,Inj_Off_Next_w ~ shock_nonscore + shock_x_blow...,1.920168,2.161217,1.125535,1.045767,0.014676,1.198169,20219.032919,21985.495595,20219.033728,21985.496404,,poisson,"no strong overdispersion signal, or NB failed"
