We set up the Python imports and open a DuckDB connection that every later cell reuses.

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning

# Best-effort: flip statsmodels' SET_USE_BIC_LLF switch when the installed
# version provides it; any failure here is deliberately ignored.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

# Hide the FutureWarning whose message starts with the deviance-formula BIC notice.
warnings.filterwarnings(
    action="ignore",
    message="The bic value is computed using the deviance formula.*",
    category=FutureWarning,
)

# Look for the database from the working directory upwards: first under a
# db/ subfolder at every level, and only then directly inside each folder.
CWD = Path().resolve()
REPO_ROOT, DB_FILE = None, None
search_dirs = [CWD, *CWD.parents]

for rel_path in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    hit = next((folder for folder in search_dirs if (folder / rel_path).exists()), None)
    if hit is not None:
        REPO_ROOT, DB_FILE = hit, hit / rel_path
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db or notebook parent folders to contain it")

# Writable connection: later cells create tables through it.
con = duckdb.connect(str(DB_FILE), read_only=False)

BASE_TABLE = "step16_modeling_frame_nolookahead"

# The base table (or view) must already exist in the main schema.
exists_df = con.execute(f"""
SELECT COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{BASE_TABLE}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()

base_table_hits = int(exists_df["n"].iloc[0])
if not base_table_hits:
    raise RuntimeError(f"Missing {BASE_TABLE}, run notebook 16 first")

print("connected db", str(DB_FILE))
print("base table", BASE_TABLE)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
base table step16_modeling_frame_nolookahead


We build a modeling table that adds rolling volatility, prior season-to-date volatility, and prior cumulative shock counts. We also construct 'ScoreLinked' and 'All' diagnostics without changing the main 'NonScore' variables.

In [2]:
# Pull the whole base table into pandas.
df = con.execute(f"SELECT * FROM {BASE_TABLE}").df()

# Record the column names DuckDB reports for the base table, as strings.
desc = con.execute(f"DESCRIBE {BASE_TABLE}").df()
cols = {str(column_name) for column_name in desc["column_name"]}

def pick_col(candidates: list[str], available: set[str], label: str) -> str:
    """Return the first name in ``candidates`` that is also in ``available``.

    Args:
        candidates: Column names to try, in priority order.
        available: Column names that actually exist.
        label: Human-readable description used in the error message.

    Raises:
        RuntimeError: If none of the candidates is available; the message
            lists up to 40 of the available names in sorted order.
    """
    matches = [name for name in candidates if name in available]
    if matches:
        return matches[0]
    preview = sorted(available)[:40]
    raise RuntimeError(f"Missing {label}, add its exact name into candidates, available columns include {preview}")

# Resolve the panel key columns from the names present in the base table.
TEAM_COL = pick_col(["team", "team_key"], cols, "team id column")
SEASON_COL = pick_col(["season"], cols, "season column")
WEEK_COL = pick_col(["week"], cols, "week column")

# Composite key: season * 100 + week.
if "season_week" not in df.columns:
    df["season_week"] = (df[SEASON_COL].astype(int) * 100 + df[WEEK_COL].astype(int)).astype(int)

if "load_nonscore" not in df.columns:
    load_candidates = ["ST_Load_NonScore_w", "ST_Load_NonScore"]
    load_col = pick_col(load_candidates, cols, "NonScore load column for volatility construction")
    df["load_nonscore"] = df[load_col].astype(float)

if "shock_nonscore" not in df.columns:
    shock_candidates = ["ST_Shock_NonScore_w", "shock_nonscore"]
    shock_col = pick_col(shock_candidates, cols, "NonScore shock column")
    df["shock_nonscore"] = df[shock_col].fillna(0).astype(int)

if "blowout_flag_w" not in df.columns:
    raise RuntimeError("Missing blowout_flag_w in the step16 frame, rerun notebook 10 and 11 then rebuild step16")

# Every "prior" feature below relies on rows being in week order within each
# team-season, so sort first and group second.
df = df.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)
g = df.groupby([TEAM_COL, SEASON_COL], sort=False)

# Shocks accumulated in earlier weeks of the same team-season. Both the
# cumulative sum and the one-week shift run inside the group, so the first
# week of a team-season is 0 instead of inheriting the previous group's total.
df["cum_shocks_nonscore_prior"] = (
    g["shock_nonscore"]
    .transform(lambda s: s.cumsum().shift(1))
    .fillna(0)
    .astype(int)
)

# Season-to-date sample SD of load using only earlier weeks; weeks with fewer
# than two prior observations get 0.0.
df["vol_nonscore_s2d_prior"] = (
    g["load_nonscore"]
    .transform(lambda s: s.expanding().std(ddof=1).shift(1))
    .fillna(0.0)
    .astype(float)
)

# Sample SD over the previous four weeks (at least two needed); where that is
# undefined, fall back to the season-to-date value above.
df["vol_nonscore_roll4_prior"] = (
    g["load_nonscore"]
    .transform(lambda s: s.shift(1).rolling(4, min_periods=2).std(ddof=1))
    .fillna(df["vol_nonscore_s2d_prior"])
    .astype(float)
)

# Lagged shock indicators for the previous one, two and three weeks of the
# same team-season; missing lags are treated as no shock.
for k in [1, 2, 3]:
    col = f"ST_Shock_NonScore_w_minus_{k}"
    if col not in df.columns:
        df[col] = g["shock_nonscore"].shift(k).fillna(0).astype(int)

# Interaction of the current-week shock with the blowout flag.
if "shock_x_blowout" not in df.columns:
    df["shock_x_blowout"] = (df["shock_nonscore"] * df["blowout_flag_w"].fillna(0).astype(int)).astype(int)

# The ScoreLinked / All load columns may live in the step-11 model view rather
# than in the base table, so make sure that view exists before using it.
MODEL_VIEW = "team_week_panel_nextweek_model"
mv_exists = con.execute(f"""
SELECT COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{MODEL_VIEW}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()
if int(mv_exists["n"].iloc[0]) == 0:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to recreate it")

mv_desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
mv_cols = set(mv_desc["column_name"].astype(str).tolist())

# Prefer the copies already in the base table; only look in the view when at
# least one of the two columns is missing there.
scorelinked_col = "ST_Load_ScoreLinked_w" if "ST_Load_ScoreLinked_w" in cols else None
all_col = "ST_Load_All_w" if "ST_Load_All_w" in cols else None

need_join = (scorelinked_col is None) or (all_col is None)
if need_join:
    if "ST_Load_ScoreLinked_w" in mv_cols:
        scorelinked_col = "ST_Load_ScoreLinked_w"
    if "ST_Load_All_w" in mv_cols:
        all_col = "ST_Load_All_w"

if scorelinked_col is not None or all_col is not None:
    join_cols = [SEASON_COL, WEEK_COL, TEAM_COL]
    sel = ["a.*"]
    # Pull from the view only the columns the base table does not already have.
    if scorelinked_col is not None and scorelinked_col not in cols:
        sel.append(f"b.{scorelinked_col} AS {scorelinked_col}")
    if all_col is not None and all_col not in cols:
        sel.append(f"b.{all_col} AS {all_col}")

    if len(sel) > 1:
        rows_before = len(df)
        on_clause = " AND ".join(f"a.{key} = b.{key}" for key in join_cols)
        # `df` in the FROM clause is the in-memory pandas frame. The explicit
        # ORDER BY restores team / season / week order, which the groupwise
        # expanding and shift features computed afterwards depend on; a SQL
        # join without ORDER BY gives no ordering guarantee.
        df = con.execute(f"""
        SELECT {", ".join(sel)}
        FROM df a
        LEFT JOIN {MODEL_VIEW} b
        ON {on_clause}
        ORDER BY a.{TEAM_COL}, a.{SEASON_COL}, a.{WEEK_COL}
        """).df()
        # A LEFT JOIN can only add rows, and only when the view repeats a key.
        if len(df) != rows_before:
            raise RuntimeError(
                f"Joining {MODEL_VIEW} changed the row count from {rows_before} to {len(df)}, "
                f"{MODEL_VIEW} has duplicate season, week, team keys"
            )

def add_diag_prefix(load_col: str, prefix: str):
    """Add shock, volatility and cumulative-shock diagnostics for one load column.

    Works in place on the module-level ``df`` and groups by the module-level
    ``TEAM_COL`` / ``SEASON_COL``. Rows must already be in week order within
    each team-season, because every "prior" quantity is built from the rows
    that precede the current one inside its group.

    Columns written (``{p}`` is ``prefix``): ``load_{p}``, ``shock_{p}``,
    ``shock_x_blowout_{p}``, ``vol_{p}_s2d_prior``, ``vol_{p}_roll4_prior``
    and ``cum_shocks_{p}_prior``.

    Args:
        load_col: Existing column in ``df`` holding the load measure.
        prefix: Tag inserted into the generated column names.
    """
    load_name = f"load_{prefix}"
    shock_name = f"shock_{prefix}"
    group_keys = [TEAM_COL, SEASON_COL]

    df[load_name] = df[load_col].astype(float)
    load_by_group = df.groupby(group_keys, sort=False)[load_name]

    # Mean and sample SD of the load over earlier weeks of the same team-season.
    mean_prior = load_by_group.transform(lambda s: s.expanding().mean().shift(1))
    sd_prior = load_by_group.transform(lambda s: s.expanding().std(ddof=1).shift(1))

    # A shock is a load at least one prior SD above the prior mean. Weeks with
    # an undefined z-score (fewer than two prior weeks) are not shocks.
    z_prior = (df[load_name] - mean_prior) / sd_prior
    df[shock_name] = (z_prior >= 1).fillna(False).astype(int)
    df[f"shock_x_blowout_{prefix}"] = (df[shock_name] * df["blowout_flag_w"].fillna(0).astype(int)).astype(int)

    df[f"vol_{prefix}_s2d_prior"] = sd_prior.fillna(0.0).astype(float)
    # Rolling four-week SD of earlier loads, falling back to the season-to-date
    # value where fewer than two earlier weeks are available.
    df[f"vol_{prefix}_roll4_prior"] = (
        load_by_group.transform(lambda s: s.shift(1).rolling(4, min_periods=2).std(ddof=1))
        .fillna(df[f"vol_{prefix}_s2d_prior"])
        .astype(float)
    )
    # Shocks accumulated in earlier weeks. The shift happens inside each group
    # so the first week of a team-season starts at 0 rather than carrying over
    # the previous group's final count.
    df[f"cum_shocks_{prefix}_prior"] = (
        df.groupby(group_keys, sort=False)[shock_name]
        .transform(lambda s: s.cumsum().shift(1))
        .fillna(0)
        .astype(int)
    )

# Build the diagnostic column families for whichever optional load columns
# actually made it into the frame.
for source_col, tag in ((scorelinked_col, "scorelinked"), (all_col, "all")):
    if source_col is not None and source_col in df.columns:
        add_diag_prefix(source_col, tag)

# Expose the frame to DuckDB under a temporary name, copy it into a real
# table, then drop the temporary registration.
tmp_view_name = "step18_model_frame_tmp"
con.register(tmp_view_name, df)
con.execute(f"CREATE OR REPLACE TABLE step18_model_frame AS SELECT * FROM {tmp_view_name}")
con.unregister(tmp_view_name)

print("wrote duckdb table step18_model_frame")
print("rows", len(df))

wrote duckdb table step18_model_frame
rows 5950


A quick sanity check to confirm that every candidate spec will use a consistent, non-missing modeling sample and that the key exposure variables have variation.

In [3]:
# Reload the table written by the previous cell.
df = con.execute("SELECT * FROM step18_model_frame").df()

# Columns every candidate specification relies on.
required = [
    "shock_nonscore",
    "shock_x_blowout",
    "cum_shocks_nonscore_prior",
    "vol_nonscore_s2d_prior",
    "vol_nonscore_roll4_prior",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
    "blowout_flag_w",
]

present = set(df.columns)
missing = [name for name in required if name not in present]
if missing:
    raise RuntimeError(f"Missing required step18 columns, {missing}")

# Per-column null counts, largest first.
nulls = df[required].isna().sum().sort_values(ascending=False)
print("null counts in required columns")
print(nulls)

print("shock_nonscore value counts")
print(df["shock_nonscore"].value_counts(dropna=False).sort_index())

# Range of the two exposure variables; the volatility is printed as float and
# the cumulative shock count as int.
for col_name, cast in (("vol_nonscore_s2d_prior", float), ("cum_shocks_nonscore_prior", int)):
    print(f"{col_name} min max")
    print(cast(df[col_name].min()), cast(df[col_name].max()))

null counts in required columns
shock_nonscore                 0
shock_x_blowout                0
cum_shocks_nonscore_prior      0
vol_nonscore_s2d_prior         0
vol_nonscore_roll4_prior       0
ST_Shock_NonScore_w_minus_1    0
ST_Shock_NonScore_w_minus_2    0
ST_Shock_NonScore_w_minus_3    0
blowout_flag_w                 0
dtype: int64
shock_nonscore value counts
shock_nonscore
0    4943
1    1007
Name: count, dtype: int64
vol_nonscore_s2d_prior min max
0.0 9.192388155425117
cum_shocks_nonscore_prior min max
0 7


We define consistent AIC and BIC calculation helpers to avoid common pitfalls, such as using deviance-based BIC or failing to account for the fact that robust clustering affects inference without altering the likelihood criteria

In [4]:
# Reload the modeling frame for this cell.
df = con.execute("SELECT * FROM step18_model_frame").df()

# Panel key columns; the team identifier falls back to team_key.
if "team" in df.columns:
    TEAM_COL = "team"
else:
    TEAM_COL = "team_key"
SEASON_COL = "season"
WEEK_COL = "week"

# Outcome columns, one per side.
OUTCOME_DEF = "Inj_Def_Next_w"
OUTCOME_OFF = "Inj_Off_Next_w"

# Stop at the first outcome that is missing from the frame.
for out in (OUTCOME_DEF, OUTCOME_OFF):
    if out in df.columns:
        continue
    raise RuntimeError(f"Missing outcome {out}, rerun notebook 9 through 11 then rebuild step16 and step18")

def aic_bic(llf: float, nobs: int, k_params: int) -> tuple[float, float]:
    """Compute AIC and BIC from a log-likelihood.

    AIC is ``-2*llf + 2*k_params`` and BIC is ``-2*llf + log(nobs)*k_params``.

    Returns:
        ``(aic, bic)`` as floats, or ``(nan, nan)`` when ``llf`` is not finite
        or when ``nobs`` or ``k_params`` is zero or negative.
    """
    if not np.isfinite(llf):
        return np.nan, np.nan
    if nobs <= 0 or k_params <= 0:
        return np.nan, np.nan

    neg_two_llf = -2.0 * llf
    aic_val = neg_two_llf + 2.0 * k_params
    bic_val = neg_two_llf + np.log(float(nobs)) * float(k_params)
    return float(aic_val), float(bic_val)

def extract_alpha(res) -> float:
    """Return the ``alpha`` value attached to a fitted result, or NaN.

    Two places are tried in order: an ``"alpha"`` entry in ``res.params``,
    then an ``alpha`` attribute on ``res.model.family``. Any error raised
    while probing one place is swallowed and the next place is tried.

    Args:
        res: A fitted results object, or ``None``.

    Returns:
        The alpha as a float, or ``nan`` when ``res`` is ``None`` or neither
        place yields a value.
    """
    if res is None:
        return np.nan

    def _from_params():
        if hasattr(res, "params") and ("alpha" in res.params.index):
            return float(res.params.loc["alpha"])
        return None

    def _from_family():
        family = getattr(getattr(res, "model", None), "family", None)
        if family is not None and hasattr(family, "alpha"):
            return float(family.alpha)
        return None

    for lookup in (_from_params, _from_family):
        try:
            value = lookup()
        except Exception:
            continue
        if value is not None:
            return value
    return np.nan

def fit_poisson(formula: str, data: pd.DataFrame):
    """Fit a Poisson-family GLM from a formula and return the fitted results.

    The fit is capped at 200 iterations and run with ``disp=0``.
    """
    poisson_glm = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    return poisson_glm.fit(maxiter=200, disp=0)

def fit_nb_discrete(formula: str, data: pd.DataFrame):
    """Fit a negative binomial count model from a formula and return the results.

    The fit is capped at 200 iterations and run with ``disp=False``.
    """
    nb_model = smf.negativebinomial(formula=formula, data=data)
    return nb_model.fit(disp=False, maxiter=200)

def robust_cluster(res, groups: pd.Series):
    """Return results carrying cluster-robust covariance, or ``None`` on failure.

    If ``res`` offers a public ``get_robustcov_results`` method it is used
    directly. Otherwise the model behind ``res`` is refitted from the
    already-estimated parameters with ``cov_type="cluster"``. The fallback is
    needed because statsmodels documents ``get_robustcov_results`` for
    linear-regression results, while GLM and discrete count models take
    ``cov_type`` / ``cov_kwds`` in ``fit``. Without it the method lookup fails
    and the caller silently gets non-clustered standard errors.

    Args:
        res: A fitted results object.
        groups: Cluster label for every row of the data the model was fitted on.

    Returns:
        A results object exposing ``bse`` and ``pvalues``, or ``None`` when the
        robust covariance cannot be computed.
    """
    try:
        recompute = getattr(res, "get_robustcov_results", None)
        if callable(recompute):
            return recompute(cov_type="cluster", groups=groups)

        model = res.model
        # If the model dropped rows (missing values), keep only the labels of
        # the rows it actually used so groups and observations line up.
        used_rows = getattr(getattr(model, "data", None), "row_labels", None)
        if isinstance(groups, pd.Series) and used_rows is not None and len(used_rows) != len(groups):
            groups = groups.loc[used_rows]

        # Integer codes work whether the labels are strings or numbers.
        group_codes, _ = pd.factorize(groups)
        return model.fit(
            start_params=np.asarray(res.params, dtype=float),
            maxiter=200,
            disp=0,
            cov_type="cluster",
            cov_kwds={"groups": group_codes},
        )
    except Exception:
        # Callers treat None as "robust covariance unavailable".
        return None

def fit_one_spec(spec_id: str, side: str, outcome: str, formula: str, family: str, data: pd.DataFrame, key_terms: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Fit one model specification and summarise it as two data frames.

    Args:
        spec_id: Identifier copied into every output row.
        side: Label copied into every output row.
        outcome: Outcome name copied into every output row.
        formula: Model formula handed to the fitting helper.
        family: ``"poisson"`` or ``"negative_binomial"``.
        data: Frame to fit on; its ``TEAM_COL`` column supplies the clusters.
        key_terms: Term names to flag with ``is_key_term = 1``.

    Returns:
        ``(meta, coefs)``. ``meta`` has a single row with the fit statistics
        (``nobs``, ``k_params``, ``llf``, ``aic``, ``bic``, ``alpha``) and
        ``fit_error``. ``coefs`` has one row per estimated parameter. When the
        fit fails, the statistics are NaN, ``fit_error`` holds the exception
        text and ``coefs`` is empty.

    Raises:
        RuntimeError: If ``family`` is not one of the two supported values.
    """
    if family not in ("poisson", "negative_binomial"):
        raise RuntimeError("family must be poisson or negative_binomial")

    identity = {"spec_id": spec_id, "side": side, "outcome": outcome, "family": family}

    def _meta_frame(**stats) -> pd.DataFrame:
        # One-row frame: identifying fields, the formula, then the statistics
        # in the order they are passed.
        return pd.DataFrame([{**identity, "formula": formula, **stats}])

    fitted = None
    failure = None
    try:
        fitter = fit_poisson if family == "poisson" else fit_nb_discrete
        fitted = fitter(formula, data)
    except Exception as exc:
        failure = str(exc)

    if fitted is None:
        failed_meta = _meta_frame(
            nobs=np.nan,
            k_params=np.nan,
            llf=np.nan,
            aic=np.nan,
            bic=np.nan,
            alpha=np.nan,
            fit_error=failure,
        )
        no_coefs = pd.DataFrame([], columns=[
            "spec_id", "side", "outcome", "family", "term", "beta", "se_cluster", "pvalue", "is_key_term"
        ])
        return failed_meta, no_coefs

    # Information criteria come from the plain likelihood fit.
    params = fitted.params
    nobs = int(getattr(fitted, "nobs", np.nan))
    k_params = int(len(params))
    llf = float(getattr(fitted, "llf", np.nan))
    aic_val, bic_val = aic_bic(llf, nobs, k_params)
    alpha = extract_alpha(fitted)

    # Standard errors and p-values are clustered by team when that succeeds,
    # otherwise they fall back to the ones from the plain fit.
    clustered = robust_cluster(fitted, data[TEAM_COL])
    if clustered is not None:
        se = clustered.bse.astype(float)
        pv = clustered.pvalues.astype(float)
    else:
        se = getattr(fitted, "bse", pd.Series(index=params.index, data=np.nan)).astype(float)
        pv = getattr(fitted, "pvalues", pd.Series(index=params.index, data=np.nan)).astype(float)

    coef_df = pd.DataFrame({
        **identity,
        "term": params.index.astype(str),
        "beta": params.values.astype(float),
        "se_cluster": se.reindex(params.index).values.astype(float),
        "pvalue": pv.reindex(params.index).values.astype(float),
    })

    flagged = set(key_terms)
    coef_df["is_key_term"] = coef_df["term"].apply(lambda term: 1 if term in flagged else 0)

    meta = _meta_frame(
        nobs=nobs,
        k_params=k_params,
        llf=llf,
        aic=aic_val,
        bic=bic_val,
        alpha=alpha,
        fit_error=None,
    )
    return meta, coef_df

# Printed once the cell has run to the end, i.e. the helper definitions above executed without error.
print("helpers ready")

helpers ready
