We initialize Python imports and open a DuckDB connection that every later cell reuses

In [1]:
from pathlib import Path
import duckdb
import numpy as np
import pandas as pd

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Location of the project's DuckDB file; the directory is created if absent.
DB_DIR = Path("../db")
DB_DIR.mkdir(parents=True, exist_ok=True)
DB_PATH = DB_DIR / "nflpa.duckdb"

# Single shared connection used by the rest of the notebook.
con = duckdb.connect(str(DB_PATH))

# Object names are defined once and interpolated everywhere below so the
# SQL cannot drift away from the constants.
PANEL_TABLE = "team_week_panel"
MODEL_VIEW = "team_week_panel_nextweek_model"
FLAGS_TABLE = "panel_next_week_flags"

print("db file", DB_PATH.resolve())

# NOTE(review): this guard requires the model view to already exist even
# though the view is dropped and rebuilt further down. Confirm whether the
# intended precondition is really the existence of PANEL_TABLE instead.
existing_views = set(con.execute("SHOW TABLES").df()["name"].tolist())
if MODEL_VIEW not in existing_views:
    raise RuntimeError(f"Missing model view {MODEL_VIEW}, run notebook 11 before step 13")

cols = con.execute(f"DESCRIBE {PANEL_TABLE}").df()["column_name"].tolist()
cols_set = set(cols)

# The panel may expose the team identifier under either name; prefer team_key.
if "team_key" in cols_set:
    TEAM_COL = "team_key"
elif "team" in cols_set:
    TEAM_COL = "team"
else:
    raise RuntimeError(f"Could not find team column in {PANEL_TABLE}, expected team_key or team")

# Drop the dependent view before the table it joins against.
con.execute(f"DROP VIEW IF EXISTS {MODEL_VIEW}")
con.execute(f"DROP TABLE IF EXISTS {FLAGS_TABLE}")

# One row per (season, week, team) with has_next_week = 1 when the same team
# also has a row for week + 1 in the same season.
con.execute(f"""
CREATE TABLE {FLAGS_TABLE} AS
WITH base AS (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_key
  FROM {PANEL_TABLE}
)
SELECT
  season,
  week,
  team_key,
  CASE
    WHEN EXISTS (
      SELECT 1
      FROM base b2
      WHERE b2.season = b1.season
        AND b2.team_key = b1.team_key
        AND b2.week = b1.week + 1
    )
    THEN 1
    ELSE 0
  END AS has_next_week
FROM base b1
""")

# The model view keeps only the panel rows that have a following week.
con.execute(f"""
CREATE VIEW {MODEL_VIEW} AS
SELECT
  p.*,
  f.has_next_week
FROM {PANEL_TABLE} p
JOIN {FLAGS_TABLE} f
  ON p.season = f.season
 AND p.week = f.week
 AND p.{TEAM_COL} = f.team_key
WHERE f.has_next_week = 1
""")

# Row count of the rebuilt view (rendered as the cell's output).
con.execute(f"SELECT COUNT(*) AS n FROM {MODEL_VIEW}").df()

db file /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


Unnamed: 0,n
0,5950


Quick sanity check to confirm that 'has_next_week' is always 1 in the model view and that the view has unique season week team keys

In [2]:
# Check 1: count the rows in the model view and how many of them carry a
# has_next_week value other than 1.
flag_check = con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week != 1 THEN 1 ELSE 0 END) AS bad_has_next_week
FROM {MODEL_VIEW}
""").df()
# Only the last expression of a cell is rendered, so this result was
# previously discarded; print it explicitly so the check is actually visible.
print(flag_check.to_string(index=False))

# Check 2: count (season, week, team) keys that occur more than once in the
# model view. Left as the trailing expression so it is rendered as before.
con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_key,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
) d
""").df()

Unnamed: 0,dup_rows
0,0


We load the modeling dataset from the model view and check that required columns exist. We also select the safest 'NonScore' shock column for modeling when a rolling version exists

In [3]:
# Column inventory of the model view; the set form is used for membership tests.
view_schema = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = view_schema["column_name"].tolist()
cols_set = set(cols)

# Key columns. "team" wins when present, otherwise "team_key" is assumed.
SEASON_COL = "season"
WEEK_COL = "week"
if "team" in cols_set:
    TEAM_COL = "team"
else:
    TEAM_COL = "team_key"

# Next-week injury outcomes for offense and defense.
OUTCOME_OFF = "Inj_Off_Next_w"
OUTCOME_DEF = "Inj_Def_Next_w"

# Every column the later cells read unconditionally.
required_base = [SEASON_COL, WEEK_COL, TEAM_COL, OUTCOME_OFF, OUTCOME_DEF]
required_base += [
    "blowout_flag_w",
    "ST_Shock_NonScore_w",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "offensive_no_play_snaps_w",
    "defensive_no_play_snaps_w",
    "points_for",
    "points_against",
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
]

# Fail early, naming every absent column in one message.
missing = [name for name in required_base if name not in cols_set]
if missing:
    raise RuntimeError("Missing required columns in model view, " + ", ".join(missing))

# Prefer the rolling shock column when the view provides it.
if "ST_Shock_NonScore_Roll_w" in cols_set:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_Roll_w"
else:
    SHOCK_COL_MAIN = "ST_Shock_NonScore_w"

# First available z-score column in preference order, or None when neither exists.
Z_COL_MAIN = next(
    (name for name in ("Z_ST_NonScore_Roll_w", "Z_ST_NonScore_w") if name in cols_set),
    None,
)

for label, chosen in (
    ("team column", TEAM_COL),
    ("shock column main", SHOCK_COL_MAIN),
    ("z column main", Z_COL_MAIN),
):
    print(label, chosen)

team column team
shock column main ST_Shock_NonScore_w
z column main Z_ST_NonScore_w


We build the modeling frame, fixed effect keys, and the blowout interaction term by using 'SHOCK_COL_MAIN' when available. We also create an expanding 'NonScore' shock alternative when 'ST_Load_NonScore_w' exists.

In [4]:
# Columns pulled from the model view: keys, outcomes and the main shock
# column first, then the remaining exposure and control columns.
select_cols = [
    SEASON_COL, WEEK_COL, TEAM_COL,
    OUTCOME_OFF, OUTCOME_DEF,
    "blowout_flag_w",
    SHOCK_COL_MAIN,
]
select_cols.extend([
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "offensive_no_play_snaps_w",
    "defensive_no_play_snaps_w",
    "points_for",
    "points_against",
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
])

# Optional columns are appended only when the view actually has them.
if Z_COL_MAIN is not None:
    select_cols += [Z_COL_MAIN]

has_st_load_nonscore = "ST_Load_NonScore_w" in cols_set
if has_st_load_nonscore:
    select_cols += ["ST_Load_NonScore_w"]

column_list_sql = ", ".join(select_cols)
df = con.execute(f"SELECT {column_list_sql} FROM {MODEL_VIEW}").df()

# Normalise the key columns: team as text, season and week as integers.
for key_col, key_type in ((TEAM_COL, str), (SEASON_COL, int), (WEEK_COL, int)):
    df[key_col] = df[key_col].astype(key_type)

# Fixed-effect keys: season*100 + week, and "<team>_<season>".
season_week_code = df[SEASON_COL] * 100 + df[WEEK_COL]
df["season_week"] = season_week_code.astype(int)
team_label = df[TEAM_COL].astype(str)
season_label = df[SEASON_COL].astype(str)
df["team_season"] = (team_label + "_" + season_label).astype(str)


def _as_int_flag(series: pd.Series) -> pd.Series:
    """Replace missing values with 0 and cast the result to int."""
    return series.fillna(0).astype(int)


# Main shock indicator, blowout flag and their interaction.
df["shock_nonscore"] = _as_int_flag(df[SHOCK_COL_MAIN])
df["blowout_flag_w"] = _as_int_flag(df["blowout_flag_w"])
df["shock_x_blowout"] = df["shock_nonscore"].mul(df["blowout_flag_w"]).astype(int)

# Lagged shock indicators get the same missing-as-zero treatment.
lag_cols = [f"ST_Shock_NonScore_w_minus_{lag}" for lag in (1, 2, 3)]
for lag_col in lag_cols:
    df[lag_col] = _as_int_flag(df[lag_col])

for float_col in ("ST_Vol_NonScore_w", "Cum_Shocks_NonScore_w"):
    df[float_col] = df[float_col].fillna(0).astype(float)

# Controls whose missing values are filled with 0 (dtype left unchanged).
numeric_fill = dict.fromkeys(
    [
        "offensive_no_play_snaps_w",
        "defensive_no_play_snaps_w",
        "short_week_flag_w",
        "bye_last_week_flag_w",
        "home_flag_w",
        "Inj_Off_Last_w",
        "Inj_Def_Last_w",
    ],
    0,
)
for fill_col, fill_value in numeric_fill.items():
    df[fill_col] = df[fill_col].fillna(fill_value)

# Rows missing any of these columns are dropped rather than imputed.
must_not_be_null = [OUTCOME_OFF, OUTCOME_DEF]
must_not_be_null += ["offensive_snaps_w", "defensive_snaps_w"]
must_not_be_null += ["points_for", "points_against"]
must_not_be_null += ["days_rest_w"]

before = df.shape[0]
df = df.dropna(subset=must_not_be_null).reset_index(drop=True)
after = df.shape[0]

# Placeholders so the columns exist even when ST_Load_NonScore_w is absent.
df["shock_nonscore_expanding"] = np.nan
df["shock_x_blowout_expanding"] = np.nan
df["z_nonscore_expanding"] = np.nan

if has_st_load_nonscore:
    # The expanding window runs in week order within each team and season.
    df = df.sort_values([TEAM_COL, SEASON_COL, WEEK_COL]).reset_index(drop=True)

    def expanding_z(x: pd.Series) -> pd.Series:
        """Return z-scores of *x* against the mean/std of strictly earlier rows.

        The expanding mean and sample std are shifted by one row so the
        current observation never contributes to its own baseline. Rows
        whose baseline std is missing or zero yield NaN.
        """
        m = x.expanding().mean().shift(1)
        s = x.expanding().std(ddof=1).shift(1)
        # A zero std (identical prior values) would otherwise produce +/-inf
        # (or 0/0), and +inf passes the ">= 1" test below as a spurious shock.
        s = s.where(s > 0)
        return (x - m) / s

    df["z_nonscore_expanding"] = df.groupby([TEAM_COL, SEASON_COL])["ST_Load_NonScore_w"].transform(expanding_z)
    # NaN compares as False, so rows without a usable baseline are not shocks.
    df["shock_nonscore_expanding"] = df["z_nonscore_expanding"].ge(1).astype(int)
    df["shock_x_blowout_expanding"] = (df["shock_nonscore_expanding"] * df["blowout_flag_w"]).astype(int)

print("rows before dropna", before)
print("rows after dropna", after)
print("has ST_Load_NonScore_w", bool(has_st_load_nonscore))
df.head(3)

rows before dropna 5950
rows after dropna 5950
has ST_Load_NonScore_w True


Unnamed: 0,season,week,team,Inj_Off_Next_w,Inj_Def_Next_w,blowout_flag_w,ST_Shock_NonScore_w,ST_Vol_NonScore_w,Cum_Shocks_NonScore_w,ST_Shock_NonScore_w_minus_1,...,Inj_Def_Last_w,Z_ST_NonScore_w,ST_Load_NonScore_w,season_week,team_season,shock_nonscore,shock_x_blowout,shock_nonscore_expanding,shock_x_blowout_expanding,z_nonscore_expanding
0,2012,1,ARI,0.0,0.0,0,0,0.0,0.0,0,...,0.0,-1.732051,14.0,201201,ARI_2012,0,0,0,0,
1,2012,2,ARI,0.0,0.0,0,0,2.828427,0.0,0,...,0.0,0.0,18.0,201202,ARI_2012,0,0,0,0,
2,2012,3,ARI,0.0,0.0,1,0,2.309401,0.0,0,...,0.0,-1.732051,14.0,201203,ARI_2012,0,0,0,0,-0.707107


We produce outcome distribution checks and simple overdispersion indicators for each outcome. We will then use these checks to inform us whether Poisson or Negative Binomial is needed

In [5]:
def outcome_dispersion_stats(y: pd.Series) -> dict:
    """Summarise a count outcome for an overdispersion check.

    Returns a dict with the mean, the sample variance (ddof=1), the
    variance-to-mean ratio (NaN unless the mean is positive), the share of
    exact zeros, and the maximum, all computed on *y* cast to float.
    """
    values = y.astype(float)
    avg = float(values.mean())
    spread = float(values.var(ddof=1))

    # The ratio is only defined for a strictly positive mean.
    ratio = spread / avg if avg > 0 else np.nan

    summary = {"mean": avg, "var": spread, "var_over_mean": ratio}
    summary["share_zero"] = float(values.eq(0).mean())
    summary["max"] = float(values.max())
    return summary

# Dispersion summaries for both outcomes, printed before the decision flags.
stats_off, stats_def = (
    outcome_dispersion_stats(df[outcome_col])
    for outcome_col in (OUTCOME_OFF, OUTCOME_DEF)
)

print("offense outcome", OUTCOME_OFF, stats_off)
print("defense outcome", OUTCOME_DEF, stats_def)

# Decision rule: overdispersion is "material" when the variance-to-mean
# ratio is defined and at least 1.5.
ratio_off = stats_off["var_over_mean"]
ratio_def = stats_def["var_over_mean"]
material_overdispersion_off = False if np.isnan(ratio_off) else ratio_off >= 1.5
material_overdispersion_def = False if np.isnan(ratio_def) else ratio_def >= 1.5

print("material overdispersion offense", bool(material_overdispersion_off))
print("material overdispersion defense", bool(material_overdispersion_def))

offense outcome Inj_Off_Next_w {'mean': 1.9201680672268908, 'var': 2.1612169830110557, 'var_over_mean': 1.125535321568121, 'share_zero': 0.17714285714285713, 'max': 9.0}
defense outcome Inj_Def_Next_w {'mean': 2.083865546218487, 'var': 2.380761656150105, 'var_over_mean': 1.1424737361152615, 'share_zero': 0.15529411764705883, 'max': 10.0}
material overdispersion offense False
material overdispersion defense False


### Poisson versus Negative Binomial choice

We use Poisson as the main model because **Inj_Off_Next_w** and **Inj_Def_Next_w** are non negative integer counts and our dispersion checks show only mild overdispersion. Specifically, the variance to mean ratios are about **1.13** for offense and about **1.14** for defense, which is not materially above **1** under our decision rule, so a Negative Binomial model is not required as the baseline. We still cluster standard errors by **team** to remain robust to within team correlation and residual misspecification.

We define the main specification formulas with team fixed effects and season week fixed effects, and we also define a reusable Poisson fitting helper with standard errors clustered by team

In [6]:
# Fixed-effect terms in formula syntax: one categorical per team and one
# per season-week.
FE_TEAM = "C(" + TEAM_COL + ")"
FE_TIME = "C(season_week)"

# Exposure terms: current shock, its blowout interaction, volatility,
# cumulative shocks, then the three lagged shock indicators.
core_exposure_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
] + [f"ST_Shock_NonScore_w_minus_{lag}" for lag in (1, 2, 3)]

# Controls shared by the offense and defense specifications.
core_controls_common = [
    "blowout_flag_w",
    "short_week_flag_w",
    "days_rest_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "points_for",
    "points_against",
    "Inj_Off_Last_w",
    "Inj_Def_Last_w",
]

# Side-specific controls: own snaps, own no-play snaps, then the other
# unit's snaps.
core_controls_off = [
    *core_controls_common,
    "offensive_snaps_w",
    "offensive_no_play_snaps_w",
    "defensive_snaps_w",
]

core_controls_def = [
    *core_controls_common,
    "defensive_snaps_w",
    "defensive_no_play_snaps_w",
    "offensive_snaps_w",
]

def build_formula(outcome: str, side: str) -> str:
    """Return the main-specification formula for *outcome*.

    ``side == "off"`` selects the offense controls; any other value selects
    the defense controls. Team and time fixed effects are appended last.
    """
    side_controls = core_controls_off if side == "off" else core_controls_def
    rhs_terms = [*core_exposure_terms, *side_controls, FE_TEAM, FE_TIME]
    return " ~ ".join((outcome, " + ".join(rhs_terms)))

# Build both main-specification formulas, then echo them for the record.
formula_off, formula_def = (
    build_formula(outcome_name, side_name)
    for outcome_name, side_name in ((OUTCOME_OFF, "off"), (OUTCOME_DEF, "def"))
)

# The empty third item produces the blank separator line.
print("formula offense", formula_off, "", sep="\n")
print("formula defense", formula_def, sep="\n")

def fit_count_model_poisson(formula: str, data: pd.DataFrame, cluster_groups: pd.Series):
    """Fit a formula-based Poisson model with cluster-robust covariance.

    *cluster_groups* supplies the cluster label for each row of *data*.
    Optimizer output is suppressed (``disp=False``). Returns the fitted
    results object.
    """
    clustered_cov = {"groups": cluster_groups}
    return smf.poisson(formula=formula, data=data).fit(
        disp=False,
        cov_type="cluster",
        cov_kwds=clustered_cov,
    )

# Cluster labels for the cluster-robust covariance: the team column of df,
# one label per row.
cluster_groups = df[TEAM_COL]

formula offense
Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + blowout_flag_w + short_week_flag_w + days_rest_w + bye_last_week_flag_w + home_flag_w + points_for + points_against + Inj_Off_Last_w + Inj_Def_Last_w + offensive_snaps_w + offensive_no_play_snaps_w + defensive_snaps_w + C(team) + C(season_week)

formula defense
Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + blowout_flag_w + short_week_flag_w + days_rest_w + bye_last_week_flag_w + home_flag_w + points_for + points_against + Inj_Off_Last_w + Inj_Def_Last_w + defensive_snaps_w + defensive_no_play_snaps_w + offensive_snaps_w + C(team) + C(season_week)


We switch the Poisson estimator to GLM Poisson with clustered standard errors to avoid singular Hessian issues, selecting the first non-singular specification from a small ordered set of minimal fallbacks

In [7]:
def fit_count_model_poisson_glm(formula: str, data: pd.DataFrame, cluster_groups: pd.Series):
    """Fit a Poisson-family GLM from *formula* with cluster-robust covariance.

    *cluster_groups* supplies the cluster label for each row of *data*.
    Returns the fitted results object.
    """
    poisson_glm = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    return poisson_glm.fit(cov_type="cluster", cov_kwds={"groups": cluster_groups})

def build_formula_custom(outcome: str, exposure_terms: list, controls_terms: list) -> str:
    """Return ``outcome ~ exposures + controls + team FE + time FE``.

    Terms are joined in the order given, with the two fixed-effect terms
    appended at the end.
    """
    all_terms = [*exposure_terms, *controls_terms, FE_TEAM, FE_TIME]
    return " ~ ".join((outcome, " + ".join(all_terms)))

def make_controls(side: str, drop_points_against: bool, drop_no_play: bool) -> list:
    """Return the control terms for one side, optionally reduced.

    Args:
        side: ``"off"`` for offense; any other value is treated as defense.
        drop_points_against: omit ``points_against`` from the shared controls.
        drop_no_play: omit the side's own ``*_no_play_snaps_w`` term.

    Returns:
        Shared controls followed by own snaps, own no-play snaps (unless
        dropped) and the other unit's snaps.
    """
    shared = [
        "blowout_flag_w",
        "short_week_flag_w",
        "days_rest_w",
        "bye_last_week_flag_w",
        "home_flag_w",
        "points_for",
        "points_against",
        "Inj_Off_Last_w",
        "Inj_Def_Last_w",
    ]
    if drop_points_against:
        shared.remove("points_against")

    if side == "off":
        own_unit, other_unit = "offensive", "defensive"
    else:
        own_unit, other_unit = "defensive", "offensive"

    side_terms = [f"{own_unit}_snaps_w"]
    if not drop_no_play:
        side_terms.append(f"{own_unit}_no_play_snaps_w")
    side_terms.append(f"{other_unit}_snaps_w")

    return shared + side_terms

# Building blocks of the exposure side of the model.
_contemporaneous_terms = ["shock_nonscore", "shock_x_blowout", "ST_Vol_NonScore_w"]
_cumulative_terms = ["Cum_Shocks_NonScore_w"]
_lag_terms = [f"ST_Shock_NonScore_w_minus_{lag}" for lag in (1, 2, 3)]

# Ordered (name, terms) fallbacks: the full set first, then a variant
# without the lags, then a variant without the cumulative count.
exposure_specs = [
    ("full_exposure", _contemporaneous_terms + _cumulative_terms + _lag_terms),
    ("drop_lags_keep_cum", _contemporaneous_terms + _cumulative_terms),
    ("drop_cum_keep_lags", _contemporaneous_terms + _lag_terms),
]

# Ordered control fallbacks, keyed by name, as
# (drop_points_against, drop_no_play) flag pairs.
_control_drops = {
    "all_controls": (False, False),
    "drop_points_against": (True, False),
    "drop_no_play": (False, True),
    "drop_points_against_and_no_play": (True, True),
}
control_specs = [(label, *drops) for label, drops in _control_drops.items()]

def select_and_fit(outcome: str, side: str):
    """Fit the first specification in the fallback grid that succeeds.

    Exposure specs form the outer loop and control specs the inner loop, so
    the full exposure set is tried with every control variant before any
    exposure term is dropped. A specification is skipped without fitting
    when one of its terms is a column of ``df`` with zero standard deviation.

    Args:
        outcome: name of the outcome column used on the formula's left side.
        side: ``"off"`` or ``"def"``, forwarded to ``make_controls``.

    Returns:
        ``(results, formula, tag)`` where *tag* is
        ``"<exposure spec>__<control spec>"``.

    Raises:
        RuntimeError: if no specification could be fitted. The last fitting
            error, if any, is attached as the exception's cause.
    """
    last_err = None
    for exp_name, exp_terms in exposure_specs:
        for ctrl_name, drop_pa, drop_np in control_specs:
            controls = make_controls(side, drop_pa, drop_np)

            # A constant regressor makes the specification unusable, so move
            # on to the next fallback instead of attempting the fit.
            used_terms = exp_terms + controls
            zero_var = [t for t in used_terms if (t in df.columns and float(df[t].std(ddof=0)) == 0.0)]
            if zero_var:
                continue

            f = build_formula_custom(outcome, exp_terms, controls)
            try:
                res = fit_count_model_poisson_glm(f, df, cluster_groups)
            except Exception as e:
                # Deliberately broad: any fitting failure just means "try the
                # next fallback". The error is kept for the final report.
                last_err = e
                continue

            return res, f, exp_name + "__" + ctrl_name

    # Chain the last failure so its traceback is not lost.
    raise RuntimeError(
        "No Poisson GLM specification fit successfully, last error was, " + str(last_err)
    ) from last_err

# Pick and fit the first workable specification for each outcome.
pois_off, formula_off_used, spec_off_used = select_and_fit(outcome=OUTCOME_OFF, side="off")
pois_def, formula_def_used, spec_def_used = select_and_fit(outcome=OUTCOME_DEF, side="def")

# Report which fallback was chosen and its formula, with a blank line
# between the two sides.
print("selected offense spec", spec_off_used)
print(formula_off_used, end="\n\n")
print("selected defense spec", spec_def_used)
print(formula_def_used)

selected offense spec full_exposure__drop_no_play
Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + blowout_flag_w + short_week_flag_w + days_rest_w + bye_last_week_flag_w + home_flag_w + points_for + points_against + Inj_Off_Last_w + Inj_Def_Last_w + offensive_snaps_w + defensive_snaps_w + C(team) + C(season_week)

selected defense spec full_exposure__drop_no_play
Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + blowout_flag_w + short_week_flag_w + days_rest_w + bye_last_week_flag_w + home_flag_w + points_for + points_against + Inj_Off_Last_w + Inj_Def_Last_w + defensive_snaps_w + offensive_snaps_w + C(team) + C(season_week)


We report the key exposure terms from the selected Poisson GLM specifications by printing coefficients as incidence rate ratios with 95 percent confidence intervals and p values

In [8]:
# Exposure terms reported by print_key_terms, in print order.
key_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
]


def print_key_terms(res, name: str):
    """Print the key exposure coefficients of a fitted model.

    The first line shows *name* and the observation count. Each term of
    ``key_terms`` found in ``res.params`` then gets one line with its
    coefficient, standard error, exp(coefficient), an interval of
    exp(coefficient -/+ 1.96 * se) and the p value. Terms absent from the
    model are skipped silently.
    """
    print(name, "nobs", int(res.nobs))
    fitted_terms = res.params.index
    for term in (candidate for candidate in key_terms if candidate in fitted_terms):
        beta = float(res.params[term])
        se = float(res.bse[term])
        # 1.96 is the normal critical value for a two-sided 95% interval.
        half_width = 1.96 * se
        interval = (
            float(np.exp(beta - half_width)),
            float(np.exp(beta + half_width)),
        )
        print(
            term,
            "beta", beta,
            "se", se,
            "irr", float(np.exp(beta)),
            "ci", interval,
            "p", float(res.pvalues[term]),
        )

# Offense first, then defense, separated by one blank line.
for position, (fitted, title) in enumerate((
    (pois_off, "poisson offense selected"),
    (pois_def, "poisson defense selected"),
)):
    if position:
        print()
    print_key_terms(fitted, title)

poisson offense selected nobs 5950
shock_nonscore beta 0.025488384819260957 se 0.03903925557951034 irr 1.0258159911631042 ci (0.9502515346513779, 1.1073893693968118) p 0.5138264490442133
shock_x_blowout beta 0.11812193723375507 se 0.06527638362578585 irr 1.125381328887374 ci (0.9902282818843683, 1.2789809769907206) p 0.07036308601380233
ST_Vol_NonScore_w beta 0.020198359972737372 se 0.016171352700463666 irr 1.0204037272088444 ci (0.9885683522462867, 1.0532643131208563) p 0.2116573738408123
Cum_Shocks_NonScore_w beta -0.016212444780333186 se 0.018855781661990773 irr 0.9839182695495867 ci (0.9482190131900751, 1.020961558128337) p 0.38989217347201954

poisson defense selected nobs 5950
shock_nonscore beta 0.026522697874523667 se 0.038001141722241026 irr 1.0268775549350568 ci (0.9531723517449845, 1.1062821019713336) p 0.4852117385779823
shock_x_blowout beta 0.07485731265567516 se 0.0668112042836124 irr 1.0777303614298173 ci (0.9454515378841218, 1.2285164129586545) p 0.26253034335462255
ST_

We identify team season groups whose outcomes are all zero, which cause the Poisson GLM with team season fixed effects to fail, and build filtered dataframes that exclude them for the offense and defense team season robustness fits

In [9]:
def all_zero_team_seasons(data: pd.DataFrame, outcome_col: str) -> pd.DataFrame:
    """Summarise *outcome_col* per ``team_season`` and flag all-zero groups.

    Returns one row per ``team_season`` with the row count (``n``), the
    outcome sum (``sum_y``), the outcome maximum (``max_y``) and a boolean
    ``all_zero`` that is True when both the sum and the maximum equal zero.
    """
    per_group = data.groupby("team_season")[outcome_col]
    summary = per_group.agg(n="size", sum_y="sum", max_y="max").reset_index()
    # Checking the maximum as well as the sum guards against values that
    # cancel out to a zero sum.
    summary["all_zero"] = summary["sum_y"].eq(0) & summary["max_y"].eq(0)
    return summary

# Per team-season outcome summaries for each side.
g_off = all_zero_team_seasons(df, OUTCOME_OFF)
g_def = all_zero_team_seasons(df, OUTCOME_DEF)

# Team seasons whose outcome is zero in every week.
flagged_off = g_off[g_off["all_zero"]]
flagged_def = g_def[g_def["all_zero"]]
n_all_zero_off = int(len(flagged_off))
n_all_zero_def = int(len(flagged_def))

print("team seasons all zero offense", n_all_zero_off, "out of", len(g_off))
print("team seasons all zero defense", n_all_zero_def, "out of", len(g_def))

bad_ts_off = set(flagged_off["team_season"])
bad_ts_def = set(flagged_def["team_season"])

# Independent copies of df without the all-zero team seasons.
keep_off = ~df["team_season"].isin(bad_ts_off)
keep_def = ~df["team_season"].isin(bad_ts_def)
df_off_ts = df[keep_off].copy()
df_def_ts = df[keep_def].copy()

print("rows kept offense", len(df_off_ts), "rows dropped", len(df) - len(df_off_ts))
print("rows kept defense", len(df_def_ts), "rows dropped", len(df) - len(df_def_ts))

# Cluster labels aligned with each filtered frame.
cluster_groups_off_ts = df_off_ts[TEAM_COL]
cluster_groups_def_ts = df_def_ts[TEAM_COL]

team seasons all zero offense 20 out of 416
team seasons all zero defense 20 out of 416
rows kept offense 5670 rows dropped 280
rows kept defense 5670 rows dropped 280


### Team season fixed effects robustness, handling all zero seasons

When we use team season fixed effects, some team seasons have the outcome equal to zero in every week, which creates perfect separation in a Poisson fixed effects model. In that case the maximum likelihood estimate for that team season fixed effect diverges and the IRLS weights become invalid, so estimation fails. For the team season robustness only, we drop team seasons where the outcome is always zero, then refit the same specification with clustered standard errors by team on the remaining sample.

We fit the team season fixed effects Poisson GLM robustness on the filtered samples that avoid all zero team seasons, and print the key term IRRs with confidence intervals and p values

In [10]:
# Fixed-effect term with one categorical level per team season.
FE_TEAM_SEASON = "C(team_season)"


def rebuild_with_team_season_fe(formula_used: str) -> str:
    """Return *formula_used* with every team FE term swapped for the team-season FE term."""
    pieces = formula_used.split(FE_TEAM)
    return FE_TEAM_SEASON.join(pieces)

def fit_poisson_glm_cluster_df(formula: str, data: pd.DataFrame, groups: pd.Series):
    """Fit a Poisson-family GLM on *data* with covariance clustered by *groups*.

    *groups* supplies the cluster label for each row of *data*. Returns the
    fitted results object.
    """
    glm_spec = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    fitted = glm_spec.fit(cov_type="cluster", cov_kwds=dict(groups=groups))
    return fitted

# Same selected specifications, with the team FE swapped for team-season FE.
formula_off_ts = rebuild_with_team_season_fe(formula_off_used)
formula_def_ts = rebuild_with_team_season_fe(formula_def_used)

# Both models are fitted on their filtered samples before anything is printed.
pois_off_ts = fit_poisson_glm_cluster_df(
    formula=formula_off_ts,
    data=df_off_ts,
    groups=cluster_groups_off_ts,
)
pois_def_ts = fit_poisson_glm_cluster_df(
    formula=formula_def_ts,
    data=df_def_ts,
    groups=cluster_groups_def_ts,
)

for position, (fitted, title) in enumerate((
    (pois_off_ts, "poisson offense team season FE filtered"),
    (pois_def_ts, "poisson defense team season FE filtered"),
)):
    if position:
        print()
    print_key_terms(fitted, title)

poisson offense team season FE filtered nobs 5670
shock_nonscore beta 0.045841564052759055 se 0.04110672216197231 irr 1.0469085298719158 ci (0.9658684720048896, 1.1347481584563284) p 0.2647715267768147
shock_x_blowout beta 0.05671267889087783 se 0.06575453632202097 irr 1.0583516799275225 ci (0.9303762429834855, 1.2039304387368064) p 0.38841752490533676
ST_Vol_NonScore_w beta 0.010195344240754288 se 0.016947657770821637 irr 1.010247493839876 ci (0.9772409207241854, 1.0443688727785094) p 0.5474548342189756
Cum_Shocks_NonScore_w beta -0.005570881450926289 se 0.024019911921540148 irr 0.9944446071341061 ci (0.9487120900879933, 1.0423816529695369) p 0.8165942172600991

poisson defense team season FE filtered nobs 5670
shock_nonscore beta 0.031216997588412922 se 0.03548771056704355 irr 1.0317093580403764 ci (0.9623867182004553, 1.1060254462555632) p 0.3790453858304871
shock_x_blowout beta 0.04444868796055591 se 0.05919437022365741 irr 1.0454513310941964 ci (0.930928989777598, 1.17406214403930