We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [1]:
import warnings
from pathlib import Path
import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Best effort: ask statsmodels to use the log-likelihood based BIC for GLMs.
# Only a missing module or missing switch is tolerated; the warning filter
# below covers the case where the switch could not be set.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except (ImportError, AttributeError):
    pass

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

CWD = Path().resolve()


def _locate_database(start: Path):
    """Find nflpa.duckdb by walking up from ``start``.

    All ancestors are searched for ``db/nflpa.duckdb`` first; only if none
    matches are they searched again for a bare ``nflpa.duckdb``.

    Returns:
        ``(directory, db_path)`` for the first hit, or ``(None, None)``.
    """
    search_dirs = [start, *start.parents]
    for relative in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
        for base in search_dirs:
            candidate = base / relative
            if candidate.exists():
                return base, candidate
    return None, None


REPO_ROOT, DB_FILE = _locate_database(CWD)

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb under the repo root")

con = duckdb.connect(str(DB_FILE))

SEASON_COL = "season"
WEEK_COL = "week"
TEAM_COL = "team"

MODEL_VIEW = "team_week_panel_nextweek_model"

# Fail early if the upstream notebook has not created the model view yet.
existing = set(con.execute("SHOW TABLES").df()["name"].tolist())
if MODEL_VIEW not in existing:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to create the model view")

print("connected db", str(DB_FILE))

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


Quick sanity check to confirm that 'has_next_week' is always 1 in the model view and that the view has unique season week team keys

In [2]:
# Check 1: has_next_week must be 1 on every row. IS DISTINCT FROM also counts
# NULLs as bad, which a plain `!= 1` comparison would silently let through.
next_week_check = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week IS DISTINCT FROM 1 THEN 1 ELSE 0 END) AS bad_has_next_week
FROM {MODEL_VIEW}
""").df()
# Only the last expression of a cell is displayed, so print this result explicitly.
print(next_week_check.to_string(index=False))

# Check 2: the (season, week, team) key must be unique; dup_rows counts the
# keys that appear more than once.
duplicate_key_check = con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    {SEASON_COL},
    {WEEK_COL},
    {TEAM_COL} AS team_key,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
) d
""").df()
duplicate_key_check

Unnamed: 0,dup_rows
0,0


We read the column list of the model view and check that the required columns exist. We also select the safest 'NonScore' shock column for modeling, preferring the rolling version when one exists.

In [3]:
# Column inventory of the model view, as a list (ordered) and a set (lookups).
view_schema = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = view_schema["column_name"].tolist()
cols_set = set(cols)

# Panel keys; the team key falls back to "team_key" when "team" is absent.
if "team" in cols_set:
    TEAM_COL = "team"
else:
    TEAM_COL = "team_key"
SEASON_COL = "season"
WEEK_COL = "week"

# Next-week injury counts modelled for each side of the ball.
OUTCOME_OFF = "Inj_Off_Next_w"
OUTCOME_DEF = "Inj_Def_Next_w"

required_base = [SEASON_COL, WEEK_COL, TEAM_COL, OUTCOME_OFF, OUTCOME_DEF] + [
    "blowout_flag_w",
    "ST_Shock_NonScore_w",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "offensive_no_play_snaps_w",
    "defensive_no_play_snaps_w",
    "points_for",
    "points_against",
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
]

# Collect absent columns in declaration order so the error message is stable.
missing = []
for required_name in required_base:
    if required_name not in cols_set:
        missing.append(required_name)
if missing:
    raise RuntimeError("Missing required columns in model view, " + ", ".join(missing))

# Prefer the rolling shock column when the view provides one.
if "ST_Shock_NonScore_Roll_w" in cols_set:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_Roll_w"
else:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_w"

# Same preference order for the z-score column; None when neither exists.
if "Z_ST_NonScore_Roll_w" in cols_set:
    Z_COL_MAIN = "Z_ST_NonScore_Roll_w"
elif "Z_ST_NonScore_w" in cols_set:
    Z_COL_MAIN = "Z_ST_NonScore_w"
else:
    Z_COL_MAIN = None

print("team column", TEAM_COL)
print("shock column main", SHOCK_COL_MAIN)
print("z column main", Z_COL_MAIN)

team column team
shock column main ST_Shock_NonScore_w
z column main Z_ST_NonScore_w


We build the modeling frame, fixed effect keys, and the blowout interaction term by using 'SHOCK_COL_MAIN' when available. We also create an expanding 'NonScore' shock alternative when 'ST_Load_NonScore_w' exists.

In [4]:
# Columns are assembled in three groups; the concatenation order below is the
# column order of the loaded frame.
key_and_outcome_cols = [
    SEASON_COL, WEEK_COL, TEAM_COL,
    OUTCOME_OFF, OUTCOME_DEF,
]
exposure_cols = [
    "blowout_flag_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
control_cols = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "offensive_no_play_snaps_w",
    "defensive_no_play_snaps_w",
    "points_for",
    "points_against",
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
]

# Optional columns are appended only when the view actually has them.
has_st_load_nonscore = "ST_Load_NonScore_w" in cols_set
optional_cols = []
if Z_COL_MAIN is not None:
    optional_cols.append(Z_COL_MAIN)
if has_st_load_nonscore:
    optional_cols.append("ST_Load_NonScore_w")

select_cols = key_and_outcome_cols + exposure_cols + control_cols + optional_cols

select_sql = ", ".join(select_cols)
df = con.execute(f"SELECT {select_sql} FROM {MODEL_VIEW}").df()

# Normalise the dtypes of the panel keys.
for key_col, key_dtype in ((TEAM_COL, str), (SEASON_COL, int), (WEEK_COL, int)):
    df[key_col] = df[key_col].astype(key_dtype)

# Fixed-effect keys: season * 100 + week, and "<team>_<season>".
season_week_key = df[SEASON_COL] * 100 + df[WEEK_COL]
df["season_week"] = season_week_key.astype(int)
team_season_key = df[TEAM_COL].astype(str) + "_" + df[SEASON_COL].astype(str)
df["team_season"] = team_season_key.astype(str)

# Shock indicator, blowout flag and their interaction; missing values become 0.
df["shock_nonscore"] = df[SHOCK_COL_MAIN].fillna(0).astype(int)
df["blowout_flag_w"] = df["blowout_flag_w"].fillna(0).astype(int)
df["shock_x_blowout"] = df["shock_nonscore"].mul(df["blowout_flag_w"]).astype(int)

# Lagged shock indicators are integer flags with missing values set to 0.
lag_cols = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
for lag_col in lag_cols:
    df[lag_col] = df[lag_col].fillna(0).astype(int)

# Volatility and cumulative shock measures are floats with missing values set to 0.
for float_col in ("ST_Vol_NonScore_w", "Cum_Shocks_NonScore_w"):
    df[float_col] = df[float_col].fillna(0).astype(float)

# Controls whose missing values are replaced by 0 (dtype left as loaded).
numeric_fill = {
    "offensive_no_play_snaps_w": 0,
    "defensive_no_play_snaps_w": 0,
    "short_week_flag_w": 0,
    "bye_last_week_flag_w": 0,
    "home_flag_w": 0,
    "Inj_Off_Last_w": 0,
    "Inj_Def_Last_w": 0,
}
for fill_col, fill_value in numeric_fill.items():
    df[fill_col] = df[fill_col].fillna(fill_value)

# Rows missing any of these columns are dropped; before/after record the row counts.
must_not_be_null = [
    OUTCOME_OFF, OUTCOME_DEF,
    "offensive_snaps_w", "defensive_snaps_w",
    "points_for", "points_against",
    "days_rest_w",
]
before = df.shape[0]
df = df.dropna(subset=must_not_be_null).reset_index(drop=True)
after = df.shape[0]

# Placeholders so the columns always exist; they stay all-NaN when
# ST_Load_NonScore_w is not available in the model view.
df["shock_nonscore_expanding"] = np.nan
df["shock_x_blowout_expanding"] = np.nan
df["z_nonscore_expanding"] = np.nan

if has_st_load_nonscore:
    # Expanding statistics depend on row order within each team-season.
    df = df.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)

    def expanding_z(x: pd.Series) -> pd.Series:
        """Z-score each value against the mean/std of strictly earlier rows.

        The expanding mean and sample std are shifted by one row so the
        current observation never contributes to its own baseline. The result
        is NaN while fewer than two earlier rows exist, and also when the
        earlier rows have zero spread: dividing by a zero std would otherwise
        produce +/-inf and flag a shock on an undefined z-score.
        """
        m = x.expanding().mean().shift(1)
        s = x.expanding().std(ddof=1).shift(1)
        s = s.where(s > 0)  # zero (or missing) prior std -> undefined z-score
        return (x - m) / s

    df["z_nonscore_expanding"] = df.groupby([TEAM_COL, SEASON_COL])["ST_Load_NonScore_w"].transform(expanding_z)
    # NaN z-scores compare as False, so undefined weeks are never flagged as shocks.
    df["shock_nonscore_expanding"] = df["z_nonscore_expanding"].ge(1).astype(int)
    df["shock_x_blowout_expanding"] = (df["shock_nonscore_expanding"] * df["blowout_flag_w"]).astype(int)

print("rows before dropna", before)
print("rows after dropna", after)
print("has ST_Load_NonScore_w", bool(has_st_load_nonscore))
df.head(3)

rows before dropna 5950
rows after dropna 5950
has ST_Load_NonScore_w True


Unnamed: 0,season,week,team,Inj_Off_Next_w,Inj_Def_Next_w,blowout_flag_w,ST_Shock_NonScore_w,ST_Vol_NonScore_w,Cum_Shocks_NonScore_w,ST_Shock_NonScore_w_minus_1,...,Inj_Def_Last_w,Z_ST_NonScore_w,ST_Load_NonScore_w,season_week,team_season,shock_nonscore,shock_x_blowout,shock_nonscore_expanding,shock_x_blowout_expanding,z_nonscore_expanding
0,2012,1,ARI,0.0,0.0,0,0,0.0,0.0,0,...,0.0,-1.732051,14.0,201201,ARI_2012,0,0,0,0,
1,2012,2,ARI,0.0,0.0,0,0,2.828427,0.0,0,...,0.0,0.0,18.0,201202,ARI_2012,0,0,0,0,
2,2012,3,ARI,0.0,0.0,1,0,2.309401,0.0,0,...,0.0,-1.732051,14.0,201203,ARI_2012,0,0,0,0,-0.707107


We produce outcome distribution checks and simple overdispersion indicators for each outcome. We will then use these checks to inform us whether Poisson or Negative Binomial is needed

In [5]:
def outcome_dispersion_stats(y: pd.Series) -> dict:
    """Summarise a count outcome for a quick overdispersion check.

    Args:
        y: Outcome values; cast to float before summarising.

    Returns:
        Dict with keys ``mean``, ``var`` (sample variance, ddof=1),
        ``var_over_mean`` (NaN unless the mean is positive), ``share_zero``
        and ``max``, all as Python floats.
    """
    values = y.astype(float)
    avg = float(values.mean())
    sample_var = float(values.var(ddof=1))

    # The ratio is only defined for a strictly positive mean.
    if avg > 0:
        ratio = sample_var / avg
    else:
        ratio = np.nan

    summary = {}
    summary["mean"] = avg
    summary["var"] = sample_var
    summary["var_over_mean"] = ratio
    summary["share_zero"] = float(values.eq(0).mean())
    summary["max"] = float(values.max())
    return summary

# Decision rule: a variance-to-mean ratio at or above this value counts as
# material overdispersion. Kept in one place so both outcomes share the rule.
OVERDISPERSION_RATIO_THRESHOLD = 1.5


def _is_materially_overdispersed(stats: dict, threshold: float = OVERDISPERSION_RATIO_THRESHOLD) -> bool:
    """Return True when ``stats["var_over_mean"]`` is defined and >= ``threshold``."""
    ratio = stats["var_over_mean"]
    return bool((not np.isnan(ratio)) and (ratio >= threshold))


stats_off = outcome_dispersion_stats(df[OUTCOME_OFF])
stats_def = outcome_dispersion_stats(df[OUTCOME_DEF])

print("offense outcome", OUTCOME_OFF, stats_off)
print("defense outcome", OUTCOME_DEF, stats_def)

material_overdispersion_off = _is_materially_overdispersed(stats_off)
material_overdispersion_def = _is_materially_overdispersed(stats_def)

print("material overdispersion offense", bool(material_overdispersion_off))
print("material overdispersion defense", bool(material_overdispersion_def))

offense outcome Inj_Off_Next_w {'mean': 1.9201680672268908, 'var': 2.1612169830110557, 'var_over_mean': 1.125535321568121, 'share_zero': 0.17714285714285713, 'max': 9.0}
defense outcome Inj_Def_Next_w {'mean': 2.083865546218487, 'var': 2.380761656150105, 'var_over_mean': 1.1424737361152615, 'share_zero': 0.15529411764705883, 'max': 10.0}
material overdispersion offense False
material overdispersion defense False


### Poisson versus Negative Binomial choice

We use Poisson as the main model because **Inj_Off_Next_w** and **Inj_Def_Next_w** are non negative integer counts and our dispersion checks show only mild overdispersion. Specifically, the variance to mean ratios are about **1.13** for offense and about **1.14** for defense, which is not materially above **1** under our decision rule, so a Negative Binomial model is not required as the baseline. We still cluster standard errors by **team** to remain robust to within team correlation and residual misspecification.

We list the distinct team codes in the modeling frame and show their row counts, which helps us confirm whether the extra team categories are relocation or naming variants.

In [6]:
# Rows per team code, most frequent first (missing codes are counted too).
rows_per_team = df[TEAM_COL].value_counts(dropna=False)
team_counts = rows_per_team.rename_axis("team").reset_index(name="n_rows")

n_team_codes = int(len(team_counts))
print("n distinct team codes", n_team_codes)
print(team_counts.head(50).to_string(index=False))

print()
print("teams with very small counts")
# Codes with fewer than 10 rows.
sparse_teams = team_counts[team_counts["n_rows"].lt(10)]
print(sparse_teams.to_string(index=False))

n distinct team codes 35
team  n_rows
  TB     187
 MIA     187
 ARI     186
  KC     186
 TEN     186
  SF     186
 SEA     186
 PIT     186
 PHI     186
 NYJ     186
 NYG     186
  NO     186
  NE     186
 MIN     186
 ATL     186
 WAS     186
 CAR     186
 BAL     186
 IND     186
 HOU     186
  GB     186
 DET     186
 DEN     186
 DAL     186
 JAX     186
 CLE     186
 CHI     186
 CIN     184
 BUF     184
  LA     130
 LAC     116
 OAK     112
  LV      74
  SD      70
 STL      56

teams with very small counts
Empty DataFrame
Columns: [team, n_rows]
Index: []


We switch the Poisson estimator to GLM Poisson with clustered standard errors to avoid singular Hessian issues, selecting the first non-singular specification from a small ordered set of minimal fallbacks

In [7]:
# Fixed-effect terms in formula syntax: one categorical for the team column and
# one for the season_week key built in the modeling frame.
FE_TEAM = f"C({TEAM_COL})"
FE_TIME = "C(season_week)"

# Group labels used for cluster-robust standard errors (the team column of df).
cluster_groups = df[TEAM_COL]

def fit_count_model_poisson_glm(formula: str, data: pd.DataFrame, cluster_groups: pd.Series):
    """Fit a Poisson GLM from a formula with cluster-robust covariance.

    Args:
        formula: Model formula passed to ``smf.glm``.
        data: Frame the formula is evaluated against.
        cluster_groups: Group labels passed as ``cov_kwds["groups"]``.

    Returns:
        The fitted statsmodels results object.
    """
    poisson_family = sm.families.Poisson()
    model = smf.glm(formula=formula, data=data, family=poisson_family)
    covariance_options = {"groups": cluster_groups}
    return model.fit(cov_type="cluster", cov_kwds=covariance_options)

def fit_count_model_negative_binomial_mle(formula: str, data: pd.DataFrame, cluster_groups: pd.Series, maxiter: int = 200):
    """Fit a Negative Binomial model by MLE with cluster-robust covariance.

    Args:
        formula: Model formula passed to ``smf.negativebinomial``.
        data: Frame the formula is evaluated against.
        cluster_groups: Group labels passed as ``cov_kwds["groups"]``.
        maxiter: Iteration cap forwarded to ``fit``.

    Returns:
        The fitted statsmodels results object (optimizer output is silenced
        via ``disp=False``).
    """
    model = smf.negativebinomial(formula=formula, data=data)
    fit_options = {
        "disp": False,
        "maxiter": maxiter,
        "cov_type": "cluster",
        "cov_kwds": {"groups": cluster_groups},
    }
    return model.fit(**fit_options)

def _safe_float(x):
    """Convert ``x`` to float, returning NaN when the conversion raises."""
    try:
        value = float(x)
    except Exception:
        value = np.nan
    return value

def print_nb_compare(pois_res, nb_res, name: str):
    """Print the Poisson and Negative Binomial AIC plus the NB alpha on one line.

    Args:
        pois_res: Poisson results object; its ``aic`` attribute is read if present.
        nb_res: Negative Binomial results object, or None when that fit failed.
        name: Label printed at the start of the line.
    """
    if nb_res is not None:
        aic_pois = _safe_float(getattr(pois_res, "aic", np.nan))
        aic_nb = _safe_float(getattr(nb_res, "aic", np.nan))

        # alpha stays NaN when the parameter is absent or cannot be read.
        try:
            has_alpha = "alpha" in nb_res.params.index
            alpha = _safe_float(nb_res.params["alpha"]) if has_alpha else np.nan
        except Exception:
            alpha = np.nan

        print(name, "aic_poisson", aic_pois, "aic_nb", aic_nb, "alpha", alpha)
    else:
        print(name, "negative binomial fit failed")

def build_formula_custom(outcome: str, exposure_terms: list, controls_terms: list) -> str:
    """Assemble ``outcome ~ exposures + controls + team FE + time FE``.

    Terms appear in the order given, followed by ``FE_TEAM`` and ``FE_TIME``.
    """
    right_hand_side = list(exposure_terms)
    right_hand_side.extend(controls_terms)
    right_hand_side.extend([FE_TEAM, FE_TIME])
    joined_terms = " + ".join(right_hand_side)
    return outcome + " ~ " + joined_terms

def make_controls(side: str, drop_points_against: bool, drop_no_play: bool) -> list:
    """Build the ordered list of control terms for one side of the ball.

    Args:
        side: ``"off"`` for the offense model or ``"def"`` for the defense model.
        drop_points_against: Leave ``points_against`` out of the common controls.
        drop_no_play: Leave the own-side no-play snap count out of the
            side-specific controls.

    Returns:
        Common controls followed by the side-specific snap controls
        (own-side snaps, own-side no-play snaps unless dropped, other-side snaps).

    Raises:
        ValueError: If ``side`` is neither ``"off"`` nor ``"def"``. Previously
            any other value was silently treated as the defense side.
    """
    if side not in ("off", "def"):
        raise ValueError(f"side must be 'off' or 'def', got {side!r}")

    common = [
        "blowout_flag_w",
        "short_week_flag_w",
        "days_rest_w",
        "bye_last_week_flag_w",
        "home_flag_w",
        "points_for",
        "points_against",
        "Inj_Off_Last_w",
        "Inj_Def_Last_w",
    ]
    if drop_points_against:
        common.remove("points_against")

    if side == "off":
        own, other = "offensive", "defensive"
    else:
        own, other = "defensive", "offensive"

    extra = [f"{own}_snaps_w"]
    if not drop_no_play:
        extra.append(f"{own}_no_play_snaps_w")
    extra.append(f"{other}_snaps_w")
    return common + extra

# Building blocks for the exposure specifications.
_core_exposure = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
]
_cumulative_exposure = ["Cum_Shocks_NonScore_w"]
_lagged_exposure = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]

# Ordered fallbacks: (spec name, exposure terms). Each entry gets its own list.
exposure_specs = [
    ("full_exposure", _core_exposure + _cumulative_exposure + _lagged_exposure),
    ("drop_lags_keep_cum", _core_exposure + _cumulative_exposure),
    ("drop_cum_keep_lags", _core_exposure + _lagged_exposure),
]

# Ordered fallbacks: (spec name, drop points_against?, drop no-play snaps?).
control_specs = [
    ("all_controls", False, False),
    ("drop_points_against", True, False),
    ("drop_no_play", False, True),
    ("drop_points_against_and_no_play", True, True),
]

def select_and_fit(outcome: str, side: str):
    """Fit the first Poisson GLM specification that estimates successfully.

    Specifications are tried in the order of ``exposure_specs`` (outer loop)
    and ``control_specs`` (inner loop). A specification is skipped when any of
    its terms has zero variance in ``df`` or when fitting raises. Every skipped
    specification is printed with its reason, so a fallback choice is visible
    in the cell output instead of being silently swallowed.

    Args:
        outcome: Outcome column name used on the left-hand side of the formula.
        side: ``"off"`` or ``"def"``, forwarded to ``make_controls``.

    Returns:
        Tuple ``(results, formula, tag)`` where ``tag`` is
        ``"<exposure spec>__<control spec>"``.

    Raises:
        RuntimeError: If no specification could be fitted; the last fitting
            error (if any) is chained as the cause.
    """
    last_err = None
    skipped = []  # (tag, reason) for each specification that was not used

    for exp_name, exp_terms in exposure_specs:
        for ctrl_name, drop_pa, drop_np in control_specs:
            tag = exp_name + "__" + ctrl_name
            controls = make_controls(side, drop_pa, drop_np)
            f = build_formula_custom(outcome, exp_terms, controls)

            # A constant regressor is collinear with the intercept, so skip the spec.
            used_terms = exp_terms + controls
            zero_var = [t for t in used_terms if (t in df.columns and float(df[t].std(ddof=0)) == 0.0)]
            if zero_var:
                skipped.append((tag, "zero variance in " + ", ".join(zero_var)))
                continue

            try:
                res = fit_count_model_poisson_glm(f, df, cluster_groups)
            except Exception as e:
                last_err = e
                skipped.append((tag, type(e).__name__ + ": " + str(e)))
                continue

            for skipped_tag, reason in skipped:
                print("skipped spec", skipped_tag, "for", outcome, "because", reason)
            return res, f, tag

    for skipped_tag, reason in skipped:
        print("skipped spec", skipped_tag, "for", outcome, "because", reason)
    raise RuntimeError(
        "No Poisson GLM specification fit successfully, last error was, " + str(last_err)
    ) from last_err

# Main Poisson fits: first specification that estimates, per side.
pois_off, formula_off_used, spec_off_used = select_and_fit(OUTCOME_OFF, "off")
pois_def, formula_def_used, spec_def_used = select_and_fit(OUTCOME_DEF, "def")

print("selected offense spec", spec_off_used)
print(formula_off_used)
print()
print("selected defense spec", spec_def_used)
print(formula_def_used)


def _fit_negative_binomial_or_none(formula: str, failure_message: str):
    """Fit the Negative Binomial comparison model; print and return None on failure."""
    try:
        return fit_count_model_negative_binomial_mle(formula, df, cluster_groups)
    except Exception as err:
        print(failure_message, str(err))
        return None


# Negative Binomial fits on the same formulas, used only for comparison.
nb_off = _fit_negative_binomial_or_none(formula_off_used, "negative binomial offense failed")
nb_def = _fit_negative_binomial_or_none(formula_def_used, "negative binomial defense failed")

print()
print_nb_compare(pois_off, nb_off, "nb compare offense")
print_nb_compare(pois_def, nb_def, "nb compare defense")

selected offense spec full_exposure__drop_no_play
Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + blowout_flag_w + short_week_flag_w + days_rest_w + bye_last_week_flag_w + home_flag_w + points_for + points_against + Inj_Off_Last_w + Inj_Def_Last_w + offensive_snaps_w + defensive_snaps_w + C(team) + C(season_week)

selected defense spec full_exposure__drop_no_play
Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + blowout_flag_w + short_week_flag_w + days_rest_w + bye_last_week_flag_w + home_flag_w + points_for + points_against + Inj_Off_Last_w + Inj_Def_Last_w + defensive_snaps_w + offensive_snaps_w + C(team) + C(season_week)

nb compare offense aic_poisson 20186.24559748423 aic_nb 20188.245884653217 alpha 1.695551607

We report the key exposure terms from the selected Poisson GLM specifications by printing coefficients as incidence rate ratios with 95 percent confidence intervals and p values

In [8]:
# Exposure terms reported as incidence rate ratios.
key_terms = ["shock_nonscore", "shock_x_blowout", "ST_Vol_NonScore_w", "Cum_Shocks_NonScore_w"]

def print_key_terms(res, name: str):
    """Print beta, SE, IRR, 95 percent CI and p value for each key term in ``res``.

    Args:
        res: Fitted results exposing ``nobs``, ``params``, ``bse`` and ``pvalues``.
        name: Label printed on the header line together with the row count.

    Key terms absent from ``res.params`` are skipped. The interval is
    ``exp(beta -/+ 1.96 * se)``.
    """
    print(name, "nobs", int(res.nobs))
    available = res.params.index
    for term in (candidate for candidate in key_terms if candidate in available):
        coef = float(res.params[term])
        std_err = float(res.bse[term])
        p_value = float(res.pvalues[term])
        rate_ratio = float(np.exp(coef))
        interval = (
            float(np.exp(coef - 1.96 * std_err)),
            float(np.exp(coef + 1.96 * std_err)),
        )
        print(term, "beta", coef, "se", std_err, "irr", rate_ratio, "ci", interval, "p", p_value)

# Report the key exposure terms of the selected offense and defense Poisson
# fits, separated by a blank line.
print_key_terms(pois_off, "poisson offense selected")
print()
print_key_terms(pois_def, "poisson defense selected")

poisson offense selected nobs 5950
shock_nonscore beta 0.025488384819260957 se 0.03903925557951034 irr 1.0258159911631042 ci (0.9502515346513779, 1.1073893693968118) p 0.5138264490442133
shock_x_blowout beta 0.11812193723375507 se 0.06527638362578585 irr 1.125381328887374 ci (0.9902282818843683, 1.2789809769907206) p 0.07036308601380233
ST_Vol_NonScore_w beta 0.020198359972737372 se 0.016171352700463666 irr 1.0204037272088444 ci (0.9885683522462867, 1.0532643131208563) p 0.2116573738408123
Cum_Shocks_NonScore_w beta -0.016212444780333186 se 0.018855781661990773 irr 0.9839182695495867 ci (0.9482190131900751, 1.020961558128337) p 0.38989217347201954

poisson defense selected nobs 5950
shock_nonscore beta 0.026522697874523667 se 0.038001141722241026 irr 1.0268775549350568 ci (0.9531723517449845, 1.1062821019713336) p 0.4852117385779823
shock_x_blowout beta 0.07485731265567516 se 0.0668112042836124 irr 1.0777303614298173 ci (0.9454515378841218, 1.2285164129586545) p 0.26253034335462255
ST_

We identify team-season groups whose outcome is zero in every week, which cause the Poisson GLM with team-season fixed effects to fail, and build filtered dataframes for the offense and defense team-season robustness fits.

In [9]:
def all_zero_team_seasons(data: pd.DataFrame, outcome_col: str) -> pd.DataFrame:
    """Summarise ``outcome_col`` per team-season and flag all-zero groups.

    Args:
        data: Frame with a ``team_season`` column and the outcome column.
        outcome_col: Name of the outcome column to summarise.

    Returns:
        One row per ``team_season`` with ``n`` (row count), ``sum_y``,
        ``max_y`` and ``all_zero`` (True when both sum and max equal 0).
    """
    outcome_by_group = data.groupby("team_season")[outcome_col]
    summary = outcome_by_group.agg(n="size", sum_y="sum", max_y="max").reset_index()
    summary["all_zero"] = summary["sum_y"].eq(0) & summary["max_y"].eq(0)
    return summary

# Per team-season outcome summaries for each side.
g_off = all_zero_team_seasons(df, OUTCOME_OFF)
g_def = all_zero_team_seasons(df, OUTCOME_DEF)

n_all_zero_off = int(g_off["all_zero"].sum())
n_all_zero_def = int(g_def["all_zero"].sum())

print("team seasons all zero offense", n_all_zero_off, "out of", len(g_off))
print("team seasons all zero defense", n_all_zero_def, "out of", len(g_def))

# Team-season keys whose outcome is zero in every row.
bad_ts_off = set(g_off.loc[g_off["all_zero"], "team_season"])
bad_ts_def = set(g_def.loc[g_def["all_zero"], "team_season"])

# Filtered copies used only by the team-season fixed-effects fits.
keep_off = ~df["team_season"].isin(bad_ts_off)
keep_def = ~df["team_season"].isin(bad_ts_def)
df_off_ts = df[keep_off].copy()
df_def_ts = df[keep_def].copy()

print("rows kept offense", len(df_off_ts), "rows dropped", len(df) - len(df_off_ts))
print("rows kept defense", len(df_def_ts), "rows dropped", len(df) - len(df_def_ts))

# Cluster labels taken from the filtered frames so they align row for row.
cluster_groups_off_ts = df_off_ts[TEAM_COL]
cluster_groups_def_ts = df_def_ts[TEAM_COL]

team seasons all zero offense 20 out of 416
team seasons all zero defense 20 out of 416
rows kept offense 5670 rows dropped 280
rows kept defense 5670 rows dropped 280


### Team season fixed effects robustness, handling all zero seasons

When we use team season fixed effects, some team seasons have the outcome equal to zero in every week, which creates perfect separation in a Poisson fixed effects model. In that case the maximum likelihood estimate for that team season fixed effect diverges and the IRLS weights become invalid, so estimation fails. For the team season robustness only, we drop team seasons where the outcome is always zero, then refit the same specification with clustered standard errors by team on the remaining sample.

We fit the team-season fixed effects Poisson GLM robustness check on the filtered samples that exclude all-zero team seasons, and print the key-term Incidence Rate Ratios (IRR) with confidence intervals and p values.

In [10]:
FE_TEAM_SEASON = "C(team_season)"

def rebuild_with_team_season_fe(formula_used: str, fe_team: str = None, fe_team_season: str = None) -> str:
    """Swap the team fixed-effect term of a formula for the team-season term.

    Args:
        formula_used: Formula containing the team fixed-effect term.
        fe_team: Term to replace; defaults to the module-level ``FE_TEAM``.
        fe_team_season: Replacement term; defaults to ``FE_TEAM_SEASON``.

    Returns:
        The formula with every occurrence of ``fe_team`` replaced.

    Raises:
        ValueError: If ``fe_team`` does not occur in ``formula_used``. Without
            this check the replacement would be a silent no-op and the
            robustness fit would repeat the original specification.
    """
    old_term = FE_TEAM if fe_team is None else fe_team
    new_term = FE_TEAM_SEASON if fe_team_season is None else fe_team_season
    if old_term not in formula_used:
        raise ValueError(f"Formula does not contain the team fixed effect term {old_term!r}")
    return formula_used.replace(old_term, new_term)

def fit_poisson_glm_cluster_df(formula: str, data: pd.DataFrame, groups: pd.Series):
    """Fit a Poisson GLM with cluster-robust covariance on ``data``.

    Kept as a named entry point for the team-season robustness fits. It
    delegates to ``fit_count_model_poisson_glm`` so the notebook has a single
    Poisson estimator implementation instead of two identical copies.

    Args:
        formula: Model formula.
        data: Frame the formula is evaluated against.
        groups: Cluster labels aligned with the rows of ``data``.

    Returns:
        The fitted statsmodels results object.
    """
    return fit_count_model_poisson_glm(formula, data, groups)

# Same selected formulas, with the team fixed effect swapped for team-season.
formula_off_ts = rebuild_with_team_season_fe(formula_off_used)
formula_def_ts = rebuild_with_team_season_fe(formula_def_used)

# Fit on the filtered frames, with cluster labels taken from those same frames.
pois_off_ts = fit_poisson_glm_cluster_df(formula_off_ts, df_off_ts, cluster_groups_off_ts)
pois_def_ts = fit_poisson_glm_cluster_df(formula_def_ts, df_def_ts, cluster_groups_def_ts)

print_key_terms(pois_off_ts, "poisson offense team season FE filtered")
print()
print_key_terms(pois_def_ts, "poisson defense team season FE filtered")

poisson offense team season FE filtered nobs 5670
shock_nonscore beta 0.045841564052759055 se 0.04110672216197231 irr 1.0469085298719158 ci (0.9658684720048896, 1.1347481584563284) p 0.2647715267768147
shock_x_blowout beta 0.05671267889087783 se 0.06575453632202097 irr 1.0583516799275225 ci (0.9303762429834855, 1.2039304387368064) p 0.38841752490533676
ST_Vol_NonScore_w beta 0.010195344240754288 se 0.016947657770821637 irr 1.010247493839876 ci (0.9772409207241854, 1.0443688727785094) p 0.5474548342189756
Cum_Shocks_NonScore_w beta -0.005570881450926289 se 0.024019911921540148 irr 0.9944446071341061 ci (0.9487120900879933, 1.0423816529695369) p 0.8165942172600991

poisson defense team season FE filtered nobs 5670
shock_nonscore beta 0.031216997588412922 se 0.03548771056704355 irr 1.0317093580403764 ci (0.9623867182004553, 1.1060254462555632) p 0.3790453858304871
shock_x_blowout beta 0.04444868796055591 se 0.05919437022365741 irr 1.0454513310941964 ci (0.930928989777598, 1.17406214403930

We replace 'shock_nonscore' with the expanding shock built from prior weeks only, refit the same selected formulas, and print the key IRRs.

In [11]:
# The modeling frame cell always creates shock_nonscore_expanding and leaves it
# all-NaN when ST_Load_NonScore_w is unavailable. Checking only for the column's
# presence can never fail, and an all-NaN column would be filled with zeros
# below, so the all-NaN case must be rejected here as well.
if ("shock_nonscore_expanding" not in df.columns) or bool(df["shock_nonscore_expanding"].isna().all()):
    raise RuntimeError("Missing shock_nonscore_expanding, confirm ST_Load_NonScore_w exists and the modeling frame cell ran")

# Work on a copy so the main modeling frame keeps its original shock columns.
df_exp = df.copy()
df_exp["shock_nonscore"] = df_exp["shock_nonscore_expanding"].fillna(0).astype(int)
df_exp["shock_x_blowout"] = (df_exp["shock_nonscore"] * df_exp["blowout_flag_w"]).astype(int)
# NOTE(review): only shock_nonscore and its blowout interaction are swapped;
# any lag or cumulative shock columns in the formulas are left as loaded.

cluster_groups_exp = df_exp[TEAM_COL]

pois_off_exp = fit_count_model_poisson_glm(formula_off_used, df_exp, cluster_groups_exp)
pois_def_exp = fit_count_model_poisson_glm(formula_def_used, df_exp, cluster_groups_exp)

print_key_terms(pois_off_exp, "poisson offense expanding shock")
print()
print_key_terms(pois_def_exp, "poisson defense expanding shock")

poisson offense expanding shock nobs 5950
shock_nonscore beta 0.018023834994610573 se 0.03782091596700432 irr 1.0181872445880638 ci (0.9454397079385538, 1.0965323926390569) p 0.6336773692947528
shock_x_blowout beta 0.07040888660469136 se 0.05867404097420273 irr 1.07294680515056 ci (0.9563873848737958, 1.203711869155099) p 0.23013909265144894
ST_Vol_NonScore_w beta 0.021777538790413845 se 0.015948054429327226 irr 1.0220164001746834 ci (0.990564151762786, 1.0544673157889053) p 0.17208668773403157
Cum_Shocks_NonScore_w beta -0.007096519359873151 se 0.017188600504410814 irr 0.9929286014750173 ci (0.9600343860606866, 1.026949890485289) p 0.6797077499600432

poisson defense expanding shock nobs 5950
shock_nonscore beta 0.02467099743710962 se 0.0327731515261652 irr 1.0249778447073525 ci (0.9612080698515773, 1.0929783208157546) p 0.45158167901670443
shock_x_blowout beta 0.03432705310817552 se 0.05402116013885113 irr 1.0349230261769686 ci (0.9309456134447828, 1.1505136869897532) p 0.52514333238

We then tidy the model outputs into one table and write it to DuckDB and CSV for reporting. We include the main Poisson models, the Negative Binomial comparison models when they fit, the team-season FE robustness, and the expanding-shock robustness.

In [12]:
def tidy_res(res, model_name: str, outcome_name: str, spec_tag: str, irr_terms: list) -> pd.DataFrame:
    """Flatten a fitted model into one row per coefficient.

    Args:
        res: Fitted results exposing ``params``, ``bse``, ``pvalues`` and ``nobs``;
            ``aic``, ``bic`` and ``llf`` are read when present, else NaN.
        model_name: Value for the ``model`` column.
        outcome_name: Value for the ``outcome`` column.
        spec_tag: Value for the ``spec_tag`` column.
        irr_terms: Terms for which IRR and its 95 percent interval are filled in.

    Returns:
        Frame with columns model, spec_tag, outcome, term, beta, se_cluster,
        pvalue, is_irr_term, irr, ci_low_irr, ci_high_irr, nobs, aic, bic, llf.
        The IRR columns are NaN for terms not listed in ``irr_terms``.
    """
    coefficients = res.params.copy()
    std_errors = res.bse.copy()
    p_values = res.pvalues.copy()

    table = pd.DataFrame({
        "model": model_name,
        "spec_tag": spec_tag,
        "outcome": outcome_name,
        "term": coefficients.index.astype(str),
        "beta": coefficients.values.astype(float),
        "se_cluster": std_errors.values.astype(float),
        "pvalue": p_values.values.astype(float),
    })

    wanted_terms = set(irr_terms)
    table["is_irr_term"] = table["term"].apply(lambda term: int(term in wanted_terms))

    # Masking before exponentiating leaves NaN for every non-IRR term.
    flagged = table["is_irr_term"].eq(1)
    margin = 1.96 * table["se_cluster"]
    table["irr"] = np.exp(table["beta"].where(flagged))
    table["ci_low_irr"] = np.exp((table["beta"] - margin).where(flagged))
    table["ci_high_irr"] = np.exp((table["beta"] + margin).where(flagged))

    # Model-level statistics are repeated on every row.
    table["nobs"] = int(res.nobs)
    for stat_name in ("aic", "bic", "llf"):
        table[stat_name] = float(getattr(res, stat_name, np.nan))
    return table

# Terms reported as incidence rate ratios in the tidy table.
key_terms = ["shock_nonscore", "shock_x_blowout", "ST_Vol_NonScore_w", "Cum_Shocks_NonScore_w"]

results = []

# Main Poisson fits.
results.append(tidy_res(pois_off, "poisson_main", OUTCOME_OFF, spec_off_used, key_terms))
results.append(tidy_res(pois_def, "poisson_main", OUTCOME_DEF, spec_def_used, key_terms))

# Negative Binomial comparison fits are included only when they estimated.
if nb_off is not None:
    results.append(tidy_res(nb_off, "negative_binomial_main", OUTCOME_OFF, spec_off_used, key_terms))
if nb_def is not None:
    results.append(tidy_res(nb_def, "negative_binomial_main", OUTCOME_DEF, spec_def_used, key_terms))

# Robustness fits: team-season fixed effects and the expanding shock definition.
results.append(tidy_res(pois_off_ts, "poisson_team_season_fe_filtered", OUTCOME_OFF, spec_off_used, key_terms))
results.append(tidy_res(pois_def_ts, "poisson_team_season_fe_filtered", OUTCOME_DEF, spec_def_used, key_terms))
results.append(tidy_res(pois_off_exp, "poisson_expanding_shock", OUTCOME_OFF, spec_off_used, key_terms))
results.append(tidy_res(pois_def_exp, "poisson_expanding_shock", OUTCOME_DEF, spec_def_used, key_terms))

results_df = pd.concat(results, ignore_index=True)

key_keep = set(key_terms)
results_df["is_key_term"] = results_df["term"].apply(lambda x: 1 if x in key_keep else 0)

# Persist to DuckDB through a temporary registration of the frame.
con.register("step13_results_tmp", results_df)
con.execute("CREATE OR REPLACE TABLE step13_model_results AS SELECT * FROM step13_results_tmp")
con.unregister("step13_results_tmp")

# Anchor the output folder on REPO_ROOT (resolved in the setup cell) rather than
# on a path relative to the current working directory, so the CSV lands in the
# same place no matter where the notebook is launched from.
out_dir = REPO_ROOT / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)
csv_path = out_dir / "step13_model_results.csv"
results_df.to_csv(csv_path, index=False)

print("wrote duckdb table step13_model_results")
print("wrote csv", csv_path.resolve())

results_df.query("is_key_term == 1").sort_values(["model", "outcome", "term"]).head(30)

wrote duckdb table step13_model_results
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step13_model_results.csv


Unnamed: 0,model,spec_tag,outcome,term,beta,se_cluster,pvalue,is_irr_term,irr,ci_low_irr,ci_high_irr,nobs,aic,bic,llf,is_key_term
1042,negative_binomial_main,full_exposure__drop_no_play,Inj_Def_Next_w,Cum_Shocks_NonScore_w,-0.01738,0.014991,0.246308,1,0.98277,0.954314,1.012075,5950,20810.217066,22583.370888,-10140.108533,1
1041,negative_binomial_main,full_exposure__drop_no_play,Inj_Def_Next_w,ST_Vol_NonScore_w,-0.00843,0.012641,0.504835,1,0.991605,0.967339,1.01648,5950,20810.217066,22583.370888,-10140.108533,1
1039,negative_binomial_main,full_exposure__drop_no_play,Inj_Def_Next_w,shock_nonscore,0.026438,0.038,0.486585,1,1.026791,0.953094,1.106186,5950,20810.217066,22583.370888,-10140.108533,1
1040,negative_binomial_main,full_exposure__drop_no_play,Inj_Def_Next_w,shock_x_blowout,0.074996,0.06689,0.26221,1,1.07788,0.945436,1.228877,5950,20810.217066,22583.370888,-10140.108533,1
777,negative_binomial_main,full_exposure__drop_no_play,Inj_Off_Next_w,Cum_Shocks_NonScore_w,-0.016246,0.018857,0.388954,1,0.983886,0.948185,1.02093,5950,20188.245885,21961.399707,-9829.122942,1
776,negative_binomial_main,full_exposure__drop_no_play,Inj_Off_Next_w,ST_Vol_NonScore_w,0.020184,0.016174,0.212056,1,1.020389,0.988549,1.053254,5950,20188.245885,21961.399707,-9829.122942,1
774,negative_binomial_main,full_exposure__drop_no_play,Inj_Off_Next_w,shock_nonscore,0.025552,0.039042,0.512806,1,1.025881,0.950307,1.107465,5950,20188.245885,21961.399707,-9829.122942,1
775,negative_binomial_main,full_exposure__drop_no_play,Inj_Off_Next_w,shock_x_blowout,0.118087,0.06528,0.070462,1,1.125342,0.990187,1.278945,5950,20188.245885,21961.399707,-9829.122942,1
2821,poisson_expanding_shock,full_exposure__drop_no_play,Inj_Def_Next_w,Cum_Shocks_NonScore_w,-0.010566,0.014844,0.476587,1,0.98949,0.961117,1.0187,5950,20811.091322,22577.553997,-10141.545661,1
2820,poisson_expanding_shock,full_exposure__drop_no_play,Inj_Def_Next_w,ST_Vol_NonScore_w,-0.007413,0.012428,0.550845,1,0.992614,0.968728,1.01709,5950,20811.091322,22577.553997,-10141.545661,1
