We initialize Python imports and opens a DuckDB connection that every later cell reuses

In [1]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats

from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning

try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

for p in [CWD] + list(CWD.parents):
    cand = p / "db" / "nflpa.duckdb"
    if cand.exists():
        REPO_ROOT = p
        DB_FILE = cand
        break

if DB_FILE is None:
    for p in [CWD] + list(CWD.parents):
        cand = p / "nflpa.duckdb"
        if cand.exists():
            REPO_ROOT = p
            DB_FILE = cand
            break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb or nflpa.duckdb near this notebook")

con = duckdb.connect(str(DB_FILE), read_only=False)

MODEL_VIEW = "team_week_panel_nextweek_model"

exists_df = con.execute(f"""
SELECT
  COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{MODEL_VIEW}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()

if int(exists_df["n"].iloc[0]) == 0:
    raise RuntimeError(f"Missing {MODEL_VIEW}, run notebook 11 to create the model view")

print("connected db", str(DB_FILE))
print("model view", MODEL_VIEW)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
model view team_week_panel_nextweek_model


Quick sanity check to confirm that the modeling view only contains next week eligible rows, has non null offensive and defensive outcomes, and has unique team week keys

In [2]:
con.execute(f"""
SELECT
  COUNT(*) AS rows_model,
  SUM(CASE WHEN has_next_week != 1 THEN 1 ELSE 0 END) AS bad_has_next_week,
  SUM(CASE WHEN Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_def,
  SUM(CASE WHEN Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END) AS null_outcome_off
FROM {MODEL_VIEW}
""").df()

desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = set(desc["column_name"].astype(str).tolist())

if "team" in cols:
    TEAM_COL = "team"
elif "team_key" in cols:
    TEAM_COL = "team_key"
else:
    raise RuntimeError(f"No team id column found in {MODEL_VIEW}, expected team or team_key")

con.execute(f"""
SELECT
  COUNT(*) AS dup_rows
FROM (
  SELECT
    season,
    week,
    {TEAM_COL} AS team_any,
    COUNT(*) AS n
  FROM {MODEL_VIEW}
  GROUP BY 1,2,3
  HAVING COUNT(*) > 1
)
""").df()

Unnamed: 0,dup_rows
0,0


We build a single, clean modeling frame table with robust column detection, consistent names, and consistent missing value handling

In [3]:
def pick_first_present(candidates: list[str], present: set[str]) -> str | None:
    for c in candidates:
        if c in present:
            return c
    return None

desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
present_cols = set(desc["column_name"].astype(str).tolist())

SEASON_COL = pick_first_present(["season"], present_cols)
WEEK_COL = pick_first_present(["week"], present_cols)
TEAM_RAW_COL = pick_first_present(["team", "team_key"], present_cols)

if SEASON_COL is None or WEEK_COL is None:
    raise RuntimeError("Missing season or week columns in model view")
if TEAM_RAW_COL is None:
    raise RuntimeError("Missing team column, expected team or team_key in model view")

OUTCOME_DEF = "Inj_Def_Next_w"
OUTCOME_OFF = "Inj_Off_Next_w"
if OUTCOME_DEF not in present_cols or OUTCOME_OFF not in present_cols:
    raise RuntimeError("Missing Inj_Def_Next_w or Inj_Off_Next_w in model view")

LAG_CANDIDATES = [
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]
LAG_COLS = [c for c in LAG_CANDIDATES if c in present_cols]

SHOCK_COL_MAIN = pick_first_present(
    ["ST_Shock_NonScore_w", "st_shock_nonscore_w", "shock_nonscore"],
    present_cols
)

POINTS_FOR_COL = pick_first_present(["points_for_w", "points_for"], present_cols)
POINTS_AGAINST_COL = pick_first_present(["points_against_w", "points_against"], present_cols)
SCORE_DIFF_COL = pick_first_present(["score_diff_w", "score_diff"], present_cols)
OFF_YPP_COL = pick_first_present(["off_yards_per_play_w", "Off_yards_per_play_w"], present_cols)
CWI_COL = pick_first_present(["Cumulative_Workload_Index_w", "cumulative_workload_index_w"], present_cols)

required_core = [
    SHOCK_COL_MAIN,
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]
if any(c is None for c in required_core):
    raise RuntimeError("Missing one or more required columns for steps 16 and 17, check your step 10 and step 11 outputs")

select_cols = [
    SEASON_COL,
    WEEK_COL,
    TEAM_RAW_COL,
    OUTCOME_DEF,
    OUTCOME_OFF,
    "Inj_Def_Last_w",
    "Inj_Off_Last_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    SHOCK_COL_MAIN,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
] + LAG_COLS + [
    POINTS_FOR_COL,
    POINTS_AGAINST_COL,
    SCORE_DIFF_COL,
    OFF_YPP_COL,
    CWI_COL,
]

df = con.execute(f"SELECT {', '.join(select_cols)} FROM {MODEL_VIEW}").df()

rename_map = {}

if TEAM_RAW_COL != "team":
    rename_map[TEAM_RAW_COL] = "team"
if POINTS_FOR_COL != "points_for":
    rename_map[POINTS_FOR_COL] = "points_for"
if POINTS_AGAINST_COL != "points_against":
    rename_map[POINTS_AGAINST_COL] = "points_against"
if SCORE_DIFF_COL != "score_diff_w":
    rename_map[SCORE_DIFF_COL] = "score_diff_w"
if OFF_YPP_COL != "off_yards_per_play_w":
    rename_map[OFF_YPP_COL] = "off_yards_per_play_w"
if CWI_COL != "Cumulative_Workload_Index_w":
    rename_map[CWI_COL] = "Cumulative_Workload_Index_w"
if SHOCK_COL_MAIN != "ST_Shock_NonScore_w":
    rename_map[SHOCK_COL_MAIN] = "ST_Shock_NonScore_w"

df = df.rename(columns=rename_map)

df["team"] = df["team"].astype(str)
df[SEASON_COL] = df[SEASON_COL].astype(int)
df[WEEK_COL] = df[WEEK_COL].astype(int)

df["season_week"] = (df[SEASON_COL] * 100 + df[WEEK_COL]).astype(int)

df["blowout_flag_w"] = df["blowout_flag_w"].fillna(0).astype(int)
df["short_week_flag_w"] = df["short_week_flag_w"].fillna(0).astype(int)
df["bye_last_week_flag_w"] = df["bye_last_week_flag_w"].fillna(0).astype(int)
df["home_flag_w"] = df["home_flag_w"].fillna(0).astype(int)

df["Inj_Def_Last_w"] = df["Inj_Def_Last_w"].fillna(0).astype(float)
df["Inj_Off_Last_w"] = df["Inj_Off_Last_w"].fillna(0).astype(float)

df["ST_Shock_NonScore_w"] = df["ST_Shock_NonScore_w"].fillna(0).astype(int)
df["shock_nonscore"] = df["ST_Shock_NonScore_w"].astype(int)
df["shock_x_blowout"] = (df["shock_nonscore"] * df["blowout_flag_w"]).astype(int)

for c in LAG_COLS:
    df[c] = df[c].fillna(0).astype(int)

must_not_be_null = [
    OUTCOME_DEF,
    OUTCOME_OFF,
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "offensive_snaps_w",
    "defensive_snaps_w",
    "points_for",
    "points_against",
    "score_diff_w",
    "off_yards_per_play_w",
    "Cumulative_Workload_Index_w",
]

before = len(df)
df = df.dropna(subset=must_not_be_null).reset_index(drop=True)
after = len(df)

print("rows before dropna", before)
print("rows after dropna", after)

con.register("step16_modeling_frame_tmp", df)
con.execute("CREATE OR REPLACE TABLE step16_modeling_frame AS SELECT * FROM step16_modeling_frame_tmp")
con.unregister("step16_modeling_frame_tmp")

print("wrote duckdb table step16_modeling_frame")
df.head(3)

rows before dropna 5950
rows after dropna 5950
wrote duckdb table step16_modeling_frame


Unnamed: 0,season,week,team,Inj_Def_Next_w,Inj_Off_Next_w,Inj_Def_Last_w,Inj_Off_Last_w,blowout_flag_w,short_week_flag_w,bye_last_week_flag_w,...,ST_Shock_NonScore_w_minus_2,ST_Shock_NonScore_w_minus_3,points_for,points_against,score_diff_w,off_yards_per_play_w,Cumulative_Workload_Index_w,season_week,shock_nonscore,shock_x_blowout
0,2012,1,ATL,2.0,2.0,0.0,0.0,1,0,0,...,0,0,40,24,16,6.836364,-3.940011,201201,0,0
1,2012,2,ATL,3.0,2.0,2.0,2.0,0,0,0,...,0,0,27,21,6,4.412698,-3.638251,201202,0,0
2,2012,3,ATL,2.0,2.0,3.0,2.0,1,0,0,...,0,0,27,3,24,5.565217,-2.938346,201203,0,0


We patch 'step16_modeling_frame' by rebuilding a prior weeks only 'NonScore' shock from the load column, and then regenerate the lag shock columns so that the model uses a lookahead free exposure

In [4]:
desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = set(desc["column_name"].astype(str).tolist())

load_candidates = [
    "ST_Load_NonScore_w",
    "ST_Load_NonScore",
    "ST_NonScore_Load_w",
    "st_load_nonscore_w",
]
LOAD_COL = None
for c in load_candidates:
    if c in cols:
        LOAD_COL = c
        break

if LOAD_COL is None:
    raise RuntimeError("NonScore load column not found, add the exact name from DESCRIBE into load_candidates")

if "load_nonscore" not in df.columns:
    load_df = con.execute(f"""
    SELECT
      season,
      week,
      team,
      {LOAD_COL} AS load_nonscore
    FROM {MODEL_VIEW}
    WHERE has_next_week = 1
    """).df()

    df = df.merge(load_df, on=["season", "week", "team"], how="left")

if df["load_nonscore"].isna().any():
    raise RuntimeError("load_nonscore has nulls after merge, check join keys or the load column")

df = df.sort_values(["season", "team", "week"]).reset_index(drop=True)

g = df.groupby(["season", "team"], sort=False)

mean_prior = g["load_nonscore"].apply(lambda s: s.expanding().mean().shift(1)).reset_index(level=[0,1], drop=True)
sd_prior = g["load_nonscore"].apply(lambda s: s.expanding().std(ddof=1).shift(1)).reset_index(level=[0,1], drop=True)

z_prior = (df["load_nonscore"] - mean_prior) / sd_prior
shock_prior = (z_prior >= 1).astype(float)
shock_prior = shock_prior.fillna(0).astype(int)

df["ST_Shock_NonScore_w"] = shock_prior
df["shock_nonscore"] = df["ST_Shock_NonScore_w"].astype(int)
df["shock_x_blowout"] = (df["shock_nonscore"] * df["blowout_flag_w"]).astype(int)

lag_map = {
    "ST_Shock_NonScore_w_minus_1": 1,
    "ST_Shock_NonScore_w_minus_2": 2,
    "ST_Shock_NonScore_w_minus_3": 3,
}
for col, k in lag_map.items():
    if col in df.columns:
        df[col] = g["ST_Shock_NonScore_w"].shift(k).fillna(0).astype(int)

con.register("step16_modeling_frame_nolookahead_tmp", df)
con.execute("CREATE OR REPLACE TABLE step16_modeling_frame_nolookahead AS SELECT * FROM step16_modeling_frame_nolookahead_tmp")
con.unregister("step16_modeling_frame_nolookahead_tmp")

print("wrote duckdb table step16_modeling_frame_nolookahead")

wrote duckdb table step16_modeling_frame_nolookahead


We overwrite the patched table by recomputing the prior weeks only shock entirely inside DuckDB using the exact sd equals zero rule and then regenerate the lag shocks and shock interaction fields from that exact shock

In [5]:
con.execute("""
CREATE OR REPLACE TABLE step16_modeling_frame_nolookahead AS
WITH base AS (
  SELECT
    *,
    AVG(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS mean_prior,
    STDDEV_SAMP(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS sd_prior
  FROM step16_modeling_frame_nolookahead
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_prior IS NULL OR sd_prior = 0 THEN NULL
      ELSE (load_nonscore - mean_prior) / sd_prior
    END AS z_prior
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_prior IS NULL THEN 0
      WHEN z_prior >= 1 THEN 1
      ELSE 0
    END AS shock_prior_only
  FROM calc
)
SELECT
  * REPLACE (
    shock_prior_only AS ST_Shock_NonScore_w,
    COALESCE(LAG(shock_prior_only, 1) OVER (PARTITION BY season, team ORDER BY week), 0) AS ST_Shock_NonScore_w_minus_1,
    COALESCE(LAG(shock_prior_only, 2) OVER (PARTITION BY season, team ORDER BY week), 0) AS ST_Shock_NonScore_w_minus_2,
    COALESCE(LAG(shock_prior_only, 3) OVER (PARTITION BY season, team ORDER BY week), 0) AS ST_Shock_NonScore_w_minus_3,
    shock_prior_only AS shock_nonscore,
    (shock_prior_only * blowout_flag_w) AS shock_x_blowout
  )
FROM final
""")

print("rebuilt step16_modeling_frame_nolookahead using duckdb prior only shock rule")

rebuilt step16_modeling_frame_nolookahead using duckdb prior only shock rule


Quick sanity check to confirm the mismatch count is now zero under the same DuckDB prior weeks only rule

In [6]:
mismatch = con.execute("""
WITH base AS (
  SELECT
    season,
    week,
    team,
    load_nonscore,
    ST_Shock_NonScore_w AS shock_patched,
    AVG(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS mean_prior,
    STDDEV_SAMP(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS sd_prior
  FROM step16_modeling_frame_nolookahead
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_prior IS NULL OR sd_prior = 0 THEN NULL
      ELSE (load_nonscore - mean_prior) / sd_prior
    END AS z_prior
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_prior IS NULL THEN 0
      WHEN z_prior >= 1 THEN 1
      ELSE 0
    END AS shock_prior_only
  FROM calc
)
SELECT
  SUM(CASE WHEN shock_patched != shock_prior_only THEN 1 ELSE 0 END) AS n_mismatch,
  CAST(SUM(CASE WHEN shock_patched != shock_prior_only THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) AS mismatch_rate
FROM final
""").df()

mismatch

Unnamed: 0,n_mismatch,mismatch_rate
0,0.0,0.0


We compute the first pass mean and variance checks for both outcomes so that we can see unconditional overdispersion before running model based tests

In [7]:
def outcome_dispersion_stats(y: pd.Series) -> dict:
    y = y.astype(float)
    mean = float(y.mean())
    var = float(y.var(ddof=1))
    share_zero = float((y == 0).mean())
    return {
        "mean": mean,
        "var": var,
        "var_over_mean": (var / mean) if mean > 0 else np.nan,
        "share_zero": share_zero,
        "max": float(y.max()),
    }

stats_def = outcome_dispersion_stats(df[OUTCOME_DEF])
stats_off = outcome_dispersion_stats(df[OUTCOME_OFF])

print("defense outcome", OUTCOME_DEF)
for k in ["mean", "var", "var_over_mean", "share_zero", "max"]:
    print(k, stats_def[k])

print()
print("offense outcome", OUTCOME_OFF)
for k in ["mean", "var", "var_over_mean", "share_zero", "max"]:
    print(k, stats_off[k])

defense outcome Inj_Def_Next_w
mean 2.083865546218487
var 2.380761656150105
var_over_mean 1.1424737361152615
share_zero 0.15529411764705883
max 10.0

offense outcome Inj_Off_Next_w
mean 1.9201680672268908
var 2.1612169830110557
var_over_mean 1.125535321568121
share_zero 0.17714285714285713
max 9.0


We fit Poisson and Negative Binomial versions of Models A and B using the same predictor blocks and fixed effects structure and store the preferred script spec that matches the existing selection rule

In [8]:
FE_TEAM = "C(team)"
FE_TIME = "C(season_week)"
cluster_groups = df["team"]

OUTCOME_DEF_USED = OUTCOME_DEF
OUTCOME_OFF_USED = OUTCOME_OFF

exposure_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
] + LAG_COLS

control_terms_base_def = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "off_yards_per_play_w",
    "Inj_Def_Last_w",
    "Cumulative_Workload_Index_w",
]

control_terms_base_off = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "off_yards_per_play_w",
    "Inj_Off_Last_w",
    "Cumulative_Workload_Index_w",
]

script_specs = [
    ("points_for_diff", ["points_for", "score_diff_w"]),
    ("points_against_diff", ["points_against", "score_diff_w"]),
    ("points_for_against", ["points_for", "points_against"]),
]

preferred_order = ["points_for_diff", "points_against_diff", "points_for_against"]

def build_formula(outcome: str, base_controls: list[str], script_terms: list[str]) -> str:
    rhs = exposure_terms + base_controls + script_terms + [FE_TEAM, FE_TIME]
    return outcome + " ~ " + " + ".join(rhs)

def fit_count_model_poisson_glm(formula: str, data: pd.DataFrame, groups: pd.Series, maxiter: int = 200):
    m = smf.glm(formula=formula, data=data, family=sm.families.Poisson())
    r = m.fit(maxiter=maxiter, cov_type="cluster", cov_kwds={"groups": groups})
    return r

def estimate_nb2_alpha_moments(y: np.ndarray, mu: np.ndarray) -> float:
    y = y.astype(float)
    mu = mu.astype(float)
    den = float(np.sum(mu ** 2))
    if den <= 0:
        return 1e-8
    num = float(np.sum((y - mu) ** 2 - mu))
    alpha = num / den
    if not np.isfinite(alpha):
        alpha = 1e-8
    alpha = float(max(alpha, 1e-8))
    alpha = float(min(alpha, 50.0))
    return alpha

def fit_count_model_negative_binomial_glm_nb2(formula: str, data: pd.DataFrame, groups: pd.Series, alpha: float, maxiter: int = 200):
    fam = sm.families.NegativeBinomial(alpha=alpha)
    m = smf.glm(formula=formula, data=data, family=fam)

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=HessianInversionWarning)
        r = m.fit(maxiter=maxiter, cov_type="cluster", cov_kwds={"groups": groups})

    return r

def fit_model_grid(outcome: str, base_controls: list[str]) -> tuple[str, str, object, object]:
    fits = []
    for tag, script_terms in script_specs:
        f = build_formula(outcome, base_controls, script_terms)

        pois = None
        nb = None

        try:
            pois = fit_count_model_poisson_glm(f, df, cluster_groups)
        except Exception as e:
            print("poisson failed", outcome, tag, str(e))
            continue

        try:
            y = df[outcome].to_numpy()
            mu = np.asarray(pois.mu)
            alpha_hat = estimate_nb2_alpha_moments(y, mu)

            nb = fit_count_model_negative_binomial_glm_nb2(
                f,
                df,
                cluster_groups,
                alpha=alpha_hat,
                maxiter=200,
            )

            nb._alpha_hat_mom = alpha_hat
        except Exception as e:
            print("negative binomial glm failed", outcome, tag, str(e))

        fits.append((tag, f, pois, nb))
        print("fit ok", outcome, tag)

    if len(fits) == 0:
        raise RuntimeError(f"No specifications fit successfully for {outcome}")

    fits_sorted = sorted(
        fits,
        key=lambda x: preferred_order.index(x[0]) if x[0] in preferred_order else 999
    )
    return fits_sorted[0]

spec_tag_def, formula_def_used, pois_def, nb_def = fit_model_grid(OUTCOME_DEF_USED, control_terms_base_def)
spec_tag_off, formula_off_used, pois_off, nb_off = fit_model_grid(OUTCOME_OFF_USED, control_terms_base_off)

print()
print("selected Model A spec", spec_tag_def)
print(formula_def_used)
print()
print("selected Model B spec", spec_tag_off)
print(formula_off_used)

if nb_def is not None:
    print()
    print("defense nb alpha hat", float(getattr(nb_def, "_alpha_hat_mom", np.nan)))

if nb_off is not None:
    print()
    print("offense nb alpha hat", float(getattr(nb_off, "_alpha_hat_mom", np.nan)))

fit ok Inj_Def_Next_w points_for_diff
fit ok Inj_Def_Next_w points_against_diff
fit ok Inj_Def_Next_w points_for_against
fit ok Inj_Off_Next_w points_for_diff
fit ok Inj_Off_Next_w points_against_diff
fit ok Inj_Off_Next_w points_for_against

selected Model A spec points_for_diff
Inj_Def_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + offensive_snaps_w + defensive_snaps_w + blowout_flag_w + short_week_flag_w + bye_last_week_flag_w + home_flag_w + off_yards_per_play_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + points_for + score_diff_w + C(team) + C(season_week)

selected Model B spec points_for_diff
Inj_Off_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + offensive_snaps_w + defensive_snaps_w + blowout_flag_w + short_week_fla

We run model based overdispersion diagnostics for each Poisson fit using Pearson dispersion and an approximate z test so the decision is not based only on unconditional variance

In [9]:
def poisson_overdispersion_diagnostics(res) -> dict:
    chi2 = float(res.pearson_chi2)
    df_resid = float(res.df_resid)
    ratio = chi2 / df_resid if df_resid > 0 else np.nan

    z = (chi2 - df_resid) / np.sqrt(2.0 * df_resid) if df_resid > 0 else np.nan
    p = float(2.0 * (1.0 - stats.norm.cdf(abs(z)))) if np.isfinite(z) else np.nan

    dev = float(res.deviance)
    dev_ratio = dev / df_resid if df_resid > 0 else np.nan

    return {
        "pearson_chi2": chi2,
        "df_resid": df_resid,
        "pearson_dispersion": ratio,
        "pearson_z": float(z) if np.isfinite(z) else np.nan,
        "pearson_pvalue": p,
        "deviance": dev,
        "deviance_dispersion": dev_ratio,
        "aic": float(getattr(res, "aic", np.nan)),
        "bic": float(getattr(res, "bic", np.nan)),
        "llf": float(getattr(res, "llf", np.nan)),
        "nobs": int(getattr(res, "nobs", np.nan)),
    }

diag_def = poisson_overdispersion_diagnostics(pois_def)
diag_off = poisson_overdispersion_diagnostics(pois_off)

print("poisson overdispersion diagnostics, defense")
for k in ["pearson_dispersion", "pearson_z", "pearson_pvalue", "deviance_dispersion", "aic", "bic", "llf", "nobs"]:
    print(k, diag_def[k])

print()
print("poisson overdispersion diagnostics, offense")
for k in ["pearson_dispersion", "pearson_z", "pearson_pvalue", "deviance_dispersion", "aic", "bic", "llf", "nobs"]:
    print(k, diag_off[k])

poisson overdispersion diagnostics, defense
pearson_dispersion 1.0702385173098001
pearson_z 3.745102989152402
pearson_pvalue 0.0001803197892711328
deviance_dispersion 1.2057215163718396
aic 20851.390060074904
bic 22617.85273568938
llf -10161.695030037452
nobs 5950

poisson overdispersion diagnostics, offense
pearson_dispersion 1.0460705408275417
pearson_z 2.456471559672583
pearson_pvalue 0.014030890765233117
deviance_dispersion 1.1989179938122556
aic 20223.29171279013
bic 21989.754388404606
llf -9847.645856395065
nobs 5950


We formalize the family choice rule and export a diagnostics table plus the selected family results for Models A and B to DuckDB and csv so that the later steps always reference a single preferred count model per side

In [10]:
def extract_nb_alpha(res) -> float:
    if res is None:
        return np.nan
    if hasattr(res, "_alpha_hat_mom"):
        try:
            return float(res._alpha_hat_mom)
        except Exception:
            pass
    try:
        fam = getattr(getattr(res, "model", None), "family", None)
        if fam is not None and hasattr(fam, "alpha"):
            return float(fam.alpha)
    except Exception:
        pass
    try:
        if hasattr(res, "params") and ("alpha" in res.params.index):
            return float(res.params["alpha"])
    except Exception:
        pass
    return np.nan

alpha_def = extract_nb_alpha(nb_def)
alpha_off = extract_nb_alpha(nb_off)

nb_meta_def = {
    "aic": float(getattr(nb_def, "aic", np.nan)) if nb_def is not None else np.nan,
    "bic": float(getattr(nb_def, "bic", np.nan)) if nb_def is not None else np.nan,
    "llf": float(getattr(nb_def, "llf", np.nan)) if nb_def is not None else np.nan,
    "nobs": int(getattr(nb_def, "nobs", np.nan)) if nb_def is not None else np.nan,
    "alpha": alpha_def,
}
nb_meta_off = {
    "aic": float(getattr(nb_off, "aic", np.nan)) if nb_off is not None else np.nan,
    "bic": float(getattr(nb_off, "bic", np.nan)) if nb_off is not None else np.nan,
    "llf": float(getattr(nb_off, "llf", np.nan)) if nb_off is not None else np.nan,
    "nobs": int(getattr(nb_off, "nobs", np.nan)) if nb_off is not None else np.nan,
    "alpha": alpha_off,
}

def choose_family(uncond_stats: dict, pois_diag: dict, nb_res) -> tuple[str, str]:
    var_over_mean = float(uncond_stats.get("var_over_mean", np.nan))
    pearson_disp = float(pois_diag.get("pearson_dispersion", np.nan))
    pval = float(pois_diag.get("pearson_pvalue", np.nan))

    overdisp_uncond = np.isfinite(var_over_mean) and (var_over_mean >= 1.5)
    overdisp_model = np.isfinite(pearson_disp) and np.isfinite(pval) and (pearson_disp >= 1.2) and (pval < 0.05)

    if (overdisp_uncond or overdisp_model) and (nb_res is not None):
        return "negative_binomial", "overdispersion flagged by unconditional or model based test"
    return "poisson", "no strong overdispersion signal, or NB failed"

choice_def, reason_def = choose_family(stats_def, diag_def, nb_def)
choice_off, reason_off = choose_family(stats_off, diag_off, nb_off)

print("family choice defense", choice_def)
print("reason defense", reason_def)
print()
print("family choice offense", choice_off)
print("reason offense", reason_off)

def tidy_count_res(res, model_name: str, outcome_name: str, spec_tag: str, key_terms: list[str]) -> pd.DataFrame:
    params = res.params.copy()
    bse = res.bse.copy()
    pvals = res.pvalues.copy()

    out = pd.DataFrame({
        "model": model_name,
        "spec_tag": spec_tag,
        "outcome": outcome_name,
        "term": params.index.astype(str),
        "beta": params.values.astype(float),
        "se_cluster": bse.values.astype(float),
        "pvalue": pvals.values.astype(float),
    })

    out["nobs"] = int(getattr(res, "nobs", np.nan))
    out["aic"] = float(getattr(res, "aic", np.nan))
    out["bic"] = float(getattr(res, "bic", np.nan))
    out["llf"] = float(getattr(res, "llf", np.nan))

    out["irr"] = np.exp(out["beta"].astype(float))
    out["irr_ci_lo"] = np.exp(out["beta"].astype(float) - 1.96 * out["se_cluster"].astype(float))
    out["irr_ci_hi"] = np.exp(out["beta"].astype(float) + 1.96 * out["se_cluster"].astype(float))

    key_keep = set(key_terms)
    out["is_key_term"] = out["term"].apply(lambda x: 1 if x in key_keep else 0)
    return out

key_terms = exposure_terms

selected_def_res = nb_def if choice_def == "negative_binomial" else pois_def
selected_off_res = nb_off if choice_off == "negative_binomial" else pois_off

selected_def_name = f"{choice_def}_modelA_selected"
selected_off_name = f"{choice_off}_modelB_selected"

selected_def_df = tidy_count_res(selected_def_res, selected_def_name, OUTCOME_DEF_USED, spec_tag_def, key_terms)
selected_off_df = tidy_count_res(selected_off_res, selected_off_name, OUTCOME_OFF_USED, spec_tag_off, key_terms)

final_selected = pd.concat([selected_def_df, selected_off_df], ignore_index=True)

diag_rows = []
diag_rows.append({
    "side": "defense",
    "spec_tag": spec_tag_def,
    "formula_used": formula_def_used,
    "outcome_mean": stats_def["mean"],
    "outcome_var": stats_def["var"],
    "outcome_var_over_mean": stats_def["var_over_mean"],
    "poisson_pearson_dispersion": diag_def["pearson_dispersion"],
    "poisson_pearson_pvalue": diag_def["pearson_pvalue"],
    "poisson_deviance_dispersion": diag_def["deviance_dispersion"],
    "poisson_aic": diag_def["aic"],
    "poisson_bic": diag_def["bic"],
    "nb_aic": nb_meta_def["aic"],
    "nb_bic": nb_meta_def["bic"],
    "nb_alpha": nb_meta_def["alpha"],
    "family_choice": choice_def,
    "choice_reason": reason_def,
})
diag_rows.append({
    "side": "offense",
    "spec_tag": spec_tag_off,
    "formula_used": formula_off_used,
    "outcome_mean": stats_off["mean"],
    "outcome_var": stats_off["var"],
    "outcome_var_over_mean": stats_off["var_over_mean"],
    "poisson_pearson_dispersion": diag_off["pearson_dispersion"],
    "poisson_pearson_pvalue": diag_off["pearson_pvalue"],
    "poisson_deviance_dispersion": diag_off["deviance_dispersion"],
    "poisson_aic": diag_off["aic"],
    "poisson_bic": diag_off["bic"],
    "nb_aic": nb_meta_off["aic"],
    "nb_bic": nb_meta_off["bic"],
    "nb_alpha": nb_meta_off["alpha"],
    "family_choice": choice_off,
    "choice_reason": reason_off,
})

diag_df = pd.DataFrame(diag_rows)

con.register("step16_overdisp_tmp", diag_df)
con.execute("CREATE OR REPLACE TABLE step16_overdispersion_diagnostics AS SELECT * FROM step16_overdisp_tmp")
con.unregister("step16_overdisp_tmp")

con.register("step16_selected_tmp", final_selected)
con.execute("CREATE OR REPLACE TABLE step16_selected_count_results AS SELECT * FROM step16_selected_tmp")
con.unregister("step16_selected_tmp")

out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

diag_csv = out_dir / "step16_overdispersion_diagnostics.csv"
sel_csv = out_dir / "step16_selected_count_results.csv"

diag_df.to_csv(diag_csv, index=False)
final_selected.to_csv(sel_csv, index=False)

print("wrote duckdb table step16_overdispersion_diagnostics")
print("wrote duckdb table step16_selected_count_results")
print("wrote csv", diag_csv.resolve())
print("wrote csv", sel_csv.resolve())

diag_df

family choice defense poisson
reason defense no strong overdispersion signal, or NB failed

family choice offense poisson
reason offense no strong overdispersion signal, or NB failed
wrote duckdb table step16_overdispersion_diagnostics
wrote duckdb table step16_selected_count_results
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step16_overdispersion_diagnostics.csv
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step16_selected_count_results.csv


Unnamed: 0,side,spec_tag,formula_used,outcome_mean,outcome_var,outcome_var_over_mean,poisson_pearson_dispersion,poisson_pearson_pvalue,poisson_deviance_dispersion,poisson_aic,poisson_bic,nb_aic,nb_bic,nb_alpha,family_choice,choice_reason
0,defense,points_for_diff,Inj_Def_Next_w ~ shock_nonscore + shock_x_blow...,2.083866,2.380762,1.142474,1.070239,0.00018,1.205722,20851.39006,22617.852736,20850.718178,22617.180853,0.006968065,poisson,"no strong overdispersion signal, or NB failed"
1,offense,points_for_diff,Inj_Off_Next_w ~ shock_nonscore + shock_x_blow...,1.920168,2.161217,1.125535,1.046071,0.014031,1.198918,20223.291713,21989.754388,20223.292511,21989.755186,1e-08,poisson,"no strong overdispersion signal, or NB failed"


Quick sanity check to confirm that the outputs exist in DuckDB, the selected results table contains one selected model per side, and key exposure terms are present

In [11]:
con.execute("""
SELECT
  table_name
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name IN ('step16_modeling_frame', 'step16_overdispersion_diagnostics', 'step16_selected_count_results')
ORDER BY table_name
""").df()

con.execute("""
SELECT
  outcome,
  COUNT(DISTINCT model) AS n_models,
  SUM(CASE WHEN is_key_term = 1 THEN 1 ELSE 0 END) AS n_key_rows
FROM step16_selected_count_results
GROUP BY 1
ORDER BY outcome
""").df()

Unnamed: 0,outcome,n_models,n_key_rows
0,Inj_Def_Next_w,1,7.0
1,Inj_Off_Next_w,1,7.0


Quick sanity check to confirm that 'ST_Shock_NonScore_w' is not using full season lookahead by comparing it to a prior weeks only shock rebuild. We also confirm that the Poisson choice rule is driven by dispersion ratios not just p values. We finally confirm the number of cluster groups

In [12]:
desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = set(desc["column_name"].astype(str).tolist())

load_candidates = [
    "ST_Load_NonScore_w",
    "ST_Load_NonScore",
    "ST_NonScore_Load_w",
    "st_load_nonscore_w",
]
LOAD_COL = None
for c in load_candidates:
    if c in cols:
        LOAD_COL = c
        break

if LOAD_COL is None:
    raise RuntimeError(
        "Could not find a NonScore load column in the model view, expected one of "
        + ", ".join(load_candidates)
        + ", search DESCRIBE output for the exact name and add it to load_candidates"
    )

lookahead_check = con.execute(f"""
WITH base AS (
  SELECT
    season,
    week,
    team,
    {LOAD_COL} AS load_nonscore,
    ST_Shock_NonScore_w AS shock_current,
    AVG({LOAD_COL}) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS mean_prior,
    STDDEV_SAMP({LOAD_COL}) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS sd_prior
  FROM {MODEL_VIEW}
  WHERE has_next_week = 1
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_prior IS NULL OR sd_prior = 0 THEN NULL
      ELSE (load_nonscore - mean_prior) / sd_prior
    END AS z_prior
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_prior IS NULL THEN NULL
      WHEN z_prior >= 1 THEN 1
      ELSE 0
    END AS shock_prior_only
  FROM calc
)
SELECT
  COUNT(*) AS n_rows,
  SUM(CASE WHEN shock_prior_only IS NULL THEN 1 ELSE 0 END) AS n_null_prior_only,
  SUM(CASE WHEN shock_prior_only IS NOT NULL AND shock_current IS NOT NULL THEN 1 ELSE 0 END) AS n_comparable,
  SUM(CASE WHEN shock_prior_only IS NOT NULL AND shock_current IS NOT NULL AND shock_prior_only != shock_current THEN 1 ELSE 0 END) AS n_mismatch,
  CAST(SUM(CASE WHEN shock_prior_only IS NOT NULL AND shock_current IS NOT NULL AND shock_prior_only != shock_current THEN 1 ELSE 0 END) AS DOUBLE)
    / NULLIF(SUM(CASE WHEN shock_prior_only IS NOT NULL AND shock_current IS NOT NULL THEN 1 ELSE 0 END), 0) AS mismatch_rate
FROM final
""").df()

lookahead_examples = con.execute(f"""
WITH base AS (
  SELECT
    season,
    week,
    team,
    {LOAD_COL} AS load_nonscore,
    ST_Shock_NonScore_w AS shock_current,
    AVG({LOAD_COL}) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS mean_prior,
    STDDEV_SAMP({LOAD_COL}) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS sd_prior
  FROM {MODEL_VIEW}
  WHERE has_next_week = 1
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_prior IS NULL OR sd_prior = 0 THEN NULL
      ELSE (load_nonscore - mean_prior) / sd_prior
    END AS z_prior
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_prior IS NULL THEN NULL
      WHEN z_prior >= 1 THEN 1
      ELSE 0
    END AS shock_prior_only
  FROM calc
)
SELECT
  season,
  week,
  team,
  load_nonscore,
  mean_prior,
  sd_prior,
  z_prior,
  shock_current,
  shock_prior_only
FROM final
WHERE shock_prior_only IS NOT NULL
  AND shock_current IS NOT NULL
  AND shock_prior_only != shock_current
ORDER BY season, team, week
LIMIT 25
""").df()

disp_rule = con.execute("""
SELECT
  side,
  poisson_pearson_dispersion,
  poisson_pearson_pvalue,
  poisson_deviance_dispersion,
  nb_alpha,
  poisson_aic,
  nb_aic,
  CASE
    WHEN poisson_pearson_dispersion >= 1.2 AND poisson_pearson_pvalue < 0.05 THEN 1
    ELSE 0
  END AS overdisp_by_ratio_rule,
  CASE
    WHEN poisson_pearson_pvalue < 0.05 THEN 1
    ELSE 0
  END AS would_flip_if_using_p_only
FROM step16_overdispersion_diagnostics
ORDER BY side
""").df()

cluster_check = con.execute("""
WITH per_team AS (
  SELECT
    team,
    COUNT(*) AS n_rows
  FROM step16_modeling_frame
  GROUP BY 1
)
SELECT
  COUNT(*) AS n_clusters,
  MIN(n_rows) AS min_rows_per_team,
  MAX(n_rows) AS max_rows_per_team,
  AVG(n_rows) AS avg_rows_per_team
FROM per_team
""").df()

lookahead_check, lookahead_examples, disp_rule, cluster_check

(   n_rows  n_null_prior_only  n_comparable  n_mismatch  mismatch_rate
 0    5950             1088.0        4862.0       379.0       0.077951,
     season  week team  load_nonscore  mean_prior  sd_prior   z_prior  \
 0     2012     4  ARI           20.0   15.333333  2.309401  2.020726   
 1     2012    14  ARI           20.0   17.090909  1.814086  1.603612   
 2     2012    15  ARI           20.0   17.333333  1.922751  1.386902   
 3     2012     4  ATL           16.0   11.000000  4.000000  1.250000   
 4     2012     4  BAL           19.0   13.000000  2.645751  2.267787   
 5     2012     9  BAL           17.0   13.666667  3.204164  1.040313   
 6     2012    11  BAL           19.0   14.125000  2.948971  1.653119   
 7     2012     4  BUF           18.0   16.333333  2.081666  0.800641   
 8     2012    14  BUF           18.0   15.818182  2.400757  0.908804   
 9     2012    10  CHI           17.0   14.000000  2.516611  1.192079   
 10    2012    14  CHI           17.0   14.181818  2.4

Quick sanity check to confirm whether 'ST_Shock_NonScore_w' matches a full season within team season z score definition, which would indicate lookahead leakage

In [13]:
desc = con.execute(f"DESCRIBE {MODEL_VIEW}").df()
cols = set(desc["column_name"].astype(str).tolist())

load_candidates = [
    "ST_Load_NonScore_w",
    "ST_Load_NonScore",
    "ST_NonScore_Load_w",
    "st_load_nonscore_w",
]
LOAD_COL = None
for c in load_candidates:
    if c in cols:
        LOAD_COL = c
        break

if LOAD_COL is None:
    raise RuntimeError("NonScore load column not found, add the exact name from DESCRIBE into load_candidates")

fullseason_check = con.execute(f"""
WITH base AS (
  SELECT
    season,
    week,
    team,
    {LOAD_COL} AS load_nonscore,
    ST_Shock_NonScore_w AS shock_current,
    AVG({LOAD_COL}) OVER (PARTITION BY season, team) AS mean_full,
    STDDEV_SAMP({LOAD_COL}) OVER (PARTITION BY season, team) AS sd_full
  FROM {MODEL_VIEW}
  WHERE has_next_week = 1
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_full IS NULL OR sd_full = 0 THEN NULL
      ELSE (load_nonscore - mean_full) / sd_full
    END AS z_full
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_full IS NULL THEN NULL
      WHEN z_full >= 1 THEN 1
      ELSE 0
    END AS shock_fullseason
  FROM calc
)
SELECT
  COUNT(*) AS n_rows,
  SUM(CASE WHEN shock_fullseason IS NULL THEN 1 ELSE 0 END) AS n_null_fullseason,
  SUM(CASE WHEN shock_fullseason IS NOT NULL AND shock_current IS NOT NULL THEN 1 ELSE 0 END) AS n_comparable,
  SUM(CASE WHEN shock_fullseason IS NOT NULL AND shock_current IS NOT NULL AND shock_fullseason != shock_current THEN 1 ELSE 0 END) AS n_mismatch,
  CAST(SUM(CASE WHEN shock_fullseason IS NOT NULL AND shock_current IS NOT NULL AND shock_fullseason != shock_current THEN 1 ELSE 0 END) AS DOUBLE)
    / NULLIF(SUM(CASE WHEN shock_fullseason IS NOT NULL AND shock_current IS NOT NULL THEN 1 ELSE 0 END), 0) AS mismatch_rate
FROM final
""").df()

fullseason_examples = con.execute(f"""
WITH base AS (
  SELECT
    season,
    week,
    team,
    {LOAD_COL} AS load_nonscore,
    ST_Shock_NonScore_w AS shock_current,
    AVG({LOAD_COL}) OVER (PARTITION BY season, team) AS mean_full,
    STDDEV_SAMP({LOAD_COL}) OVER (PARTITION BY season, team) AS sd_full
  FROM {MODEL_VIEW}
  WHERE has_next_week = 1
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_full IS NULL OR sd_full = 0 THEN NULL
      ELSE (load_nonscore - mean_full) / sd_full
    END AS z_full
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_full IS NULL THEN NULL
      WHEN z_full >= 1 THEN 1
      ELSE 0
    END AS shock_fullseason
  FROM calc
)
SELECT
  season,
  week,
  team,
  load_nonscore,
  mean_full,
  sd_full,
  z_full,
  shock_current,
  shock_fullseason
FROM final
WHERE shock_fullseason IS NOT NULL
  AND shock_current IS NOT NULL
  AND shock_fullseason != shock_current
ORDER BY season, team, week
LIMIT 25
""").df()

fullseason_check, fullseason_examples

(   n_rows  n_null_fullseason  n_comparable  n_mismatch  mismatch_rate
 0    5950              238.0        5712.0       141.0       0.024685,
     season  week team  load_nonscore  mean_full   sd_full    z_full  \
 0     2012     4  ATL           16.0  12.000000  3.762160  1.063219   
 1     2012     5  ATL           16.0  12.000000  3.762160  1.063219   
 2     2012     2  BUF           18.0  15.857143  2.248320  0.953092   
 3     2012     4  BUF           18.0  15.857143  2.248320  0.953092   
 4     2012    14  BUF           18.0  15.857143  2.248320  0.953092   
 5     2012     3  HOU           19.0  16.571429  2.502746  0.970363   
 6     2012    13  HOU           19.0  16.571429  2.502746  0.970363   
 7     2012    14  HOU           19.0  16.571429  2.502746  0.970363   
 8     2012     4  MIA           18.0  15.071429  3.197698  0.915837   
 9     2012    10  MIA           18.0  15.071429  3.197698  0.915837   
 10    2012    11  MIA           18.0  15.071429  3.197698  0.915

Quick sanity check to confirm the patched table has the same row count as before, that the new shock is binary and non null, and taht the lags are binary and non null

In [14]:
con.execute("""
SELECT
  COUNT(*) AS n_rows,
  SUM(CASE WHEN ST_Shock_NonScore_w IS NULL THEN 1 ELSE 0 END) AS n_null_shock,
  SUM(CASE WHEN ST_Shock_NonScore_w NOT IN (0, 1) THEN 1 ELSE 0 END) AS n_bad_shock,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_1 IS NULL THEN 1 ELSE 0 END) AS n_null_m1,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_1 NOT IN (0, 1) THEN 1 ELSE 0 END) AS n_bad_m1,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_2 IS NULL THEN 1 ELSE 0 END) AS n_null_m2,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_2 NOT IN (0, 1) THEN 1 ELSE 0 END) AS n_bad_m2,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_3 IS NULL THEN 1 ELSE 0 END) AS n_null_m3,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_3 NOT IN (0, 1) THEN 1 ELSE 0 END) AS n_bad_m3
FROM step16_modeling_frame_nolookahead
""").df()

Unnamed: 0,n_rows,n_null_shock,n_bad_shock,n_null_m1,n_bad_m1,n_null_m2,n_bad_m2,n_null_m3,n_bad_m3
0,5950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Quick sanity check to confirm that the patched shock matches the prior weeks only definition exactly and that the row count stays the same

In [15]:
mismatch_summary = con.execute("""
WITH base AS (
  SELECT
    season,
    week,
    team,
    load_nonscore,
    ST_Shock_NonScore_w AS shock_patched,
    AVG(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS mean_prior,
    STDDEV_SAMP(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS sd_prior
  FROM step16_modeling_frame_nolookahead
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_prior IS NULL OR sd_prior = 0 THEN NULL
      ELSE (load_nonscore - mean_prior) / sd_prior
    END AS z_prior
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_prior IS NULL THEN 0
      WHEN z_prior >= 1 THEN 1
      ELSE 0
    END AS shock_prior_only
  FROM calc
),
mismatches AS (
  SELECT
    season,
    week,
    team,
    load_nonscore,
    mean_prior,
    sd_prior,
    z_prior,
    shock_patched,
    shock_prior_only
  FROM final
  WHERE shock_patched != shock_prior_only
)
SELECT
  (SELECT COUNT(*) FROM step16_modeling_frame_nolookahead) AS n_rows,
  (SELECT SUM(CASE WHEN ST_Shock_NonScore_w IS NULL THEN 1 ELSE 0 END) FROM step16_modeling_frame_nolookahead) AS n_null_shock,
  (SELECT SUM(CASE WHEN ST_Shock_NonScore_w NOT IN (0, 1) THEN 1 ELSE 0 END) FROM step16_modeling_frame_nolookahead) AS n_bad_values,
  COUNT(*) AS n_mismatch,
  CAST(COUNT(*) AS DOUBLE) / NULLIF((SELECT COUNT(*) FROM step16_modeling_frame_nolookahead), 0) AS mismatch_rate
FROM mismatches
""").df()

mismatch_examples = con.execute("""
WITH base AS (
  SELECT
    season,
    week,
    team,
    load_nonscore,
    ST_Shock_NonScore_w AS shock_patched,
    AVG(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS mean_prior,
    STDDEV_SAMP(load_nonscore) OVER (
      PARTITION BY season, team
      ORDER BY week
      ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
    ) AS sd_prior
  FROM step16_modeling_frame_nolookahead
),
calc AS (
  SELECT
    *,
    CASE
      WHEN sd_prior IS NULL OR sd_prior = 0 THEN NULL
      ELSE (load_nonscore - mean_prior) / sd_prior
    END AS z_prior
  FROM base
),
final AS (
  SELECT
    *,
    CASE
      WHEN z_prior IS NULL THEN 0
      WHEN z_prior >= 1 THEN 1
      ELSE 0
    END AS shock_prior_only
  FROM calc
)
SELECT
  season,
  week,
  team,
  load_nonscore,
  mean_prior,
  sd_prior,
  z_prior,
  shock_patched,
  shock_prior_only
FROM final
WHERE shock_patched != shock_prior_only
ORDER BY season, team, week
LIMIT 25
""").df()

mismatch_summary, mismatch_examples

(   n_rows  n_null_shock  n_bad_values  n_mismatch  mismatch_rate
 0    5950           0.0           0.0           0            0.0,
 Empty DataFrame
 Columns: [season, week, team, load_nonscore, mean_prior, sd_prior, z_prior, shock_patched, shock_prior_only]
 Index: [])