We import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [2]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Best-effort switch of the statsmodels GLM BIC setting via SET_USE_BIC_LLF.
# Any failure (import or call) is deliberately ignored so the notebook still
# runs; presumably this covers statsmodels releases that lack the setter --
# TODO confirm against the pinned version.
try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

# Suppress the FutureWarning whose message starts with the deviance-based BIC
# notice, so fits below do not repeat it on every access.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

# Locate the DuckDB file by walking upward from the working directory.
# Every ancestor is checked for db/nflpa.duckdb first; only when none of them
# has it are the same directories re-checked for a bare nflpa.duckdb.
CWD = Path().resolve()
_search_dirs = [CWD, *CWD.parents]

REPO_ROOT = None
DB_FILE = None

for _relative in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    _hit = next((d for d in _search_dirs if (d / _relative).exists()), None)
    if _hit is not None:
        REPO_ROOT, DB_FILE = _hit, _hit / _relative
        break

if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb or nflpa.duckdb near this notebook")

# Opened read-write because later cells create tables through this connection.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Name of the modeling frame this notebook reads. It must already exist in the
# `main` schema, either as a base table or as a view.
base_table = "step16_modeling_frame_nolookahead"

exists_df = con.execute(f"""
SELECT
  COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = '{base_table}'
  AND table_type IN ('BASE TABLE', 'VIEW')
""").df()

if int(exists_df["n"].iloc[0]) == 0:
    # Name the table that was actually looked up so the message matches the check.
    raise RuntimeError(f"Missing {base_table}, run notebook 16 first")

print("connected db", str(DB_FILE))
print("base table", base_table)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
base table step16_modeling_frame_nolookahead


Quick sanity check to confirm that the binary outcomes have variation and that the predictors used in the logistic models have no missing values in the modeling frame

In [3]:
# Pull the whole modeling frame into pandas.
df = con.execute(f"SELECT * FROM {base_table}").df()

# Binary outcomes: 1 when the corresponding Inj_*_Next_w value is at least 1.
# NOTE(review): a missing value compares False and is therefore coded 0 --
# confirm the frame has no missing Inj_*_Next_w entries.
df["Any_Def_Injury_Next_w"] = df["Inj_Def_Next_w"].astype(float).ge(1.0).astype(int)
df["Any_Off_Injury_Next_w"] = df["Inj_Off_Next_w"].astype(float).ge(1.0).astype(int)

def_outcome = df["Any_Def_Injury_Next_w"]
off_outcome = df["Any_Off_Injury_Next_w"]

# One-row overview: sample size, event rates and event counts per outcome.
summary = pd.DataFrame(
    {
        "n_rows": [len(df)],
        "def_rate": [float(def_outcome.mean())],
        "off_rate": [float(off_outcome.mean())],
        "def_any_ones": [int(def_outcome.sum())],
        "off_any_ones": [int(off_outcome.sum())],
    }
)

# One-row count of missing values for the predictors checked here.
core_predictors = (
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "points_for",
    "points_against",
)
missing_core = pd.DataFrame(
    {f"missing_{col}": [int(df[col].isna().sum())] for col in core_predictors}
)

summary, missing_core

(   n_rows  def_rate  off_rate  def_any_ones  off_any_ones
 0    5950  0.844706  0.822857          5026          4896,
    missing_ST_Vol_NonScore_w  missing_Cum_Shocks_NonScore_w  \
 0                          0                              0   
 
    missing_points_for  missing_points_against  
 0                   0                       0  )

We fit logistic regression models for offense and defense using the same predictor blocks and fixed effects as the count models so that interpretation stays aligned

In [4]:
# Fixed-effect terms appended to every formula, and the grouping used for
# cluster-robust standard errors.
FE_TEAM = "C(team)"
FE_TIME = "C(season_week)"
cluster_groups = df["team"]

# Lagged shock columns are optional: keep only those present in the frame.
_lag_candidates = (
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
)
LAG_COLS = [name for name in _lag_candidates if name in df.columns]

exposure_terms = [
    "shock_nonscore",
    "shock_x_blowout",
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    *LAG_COLS,
]

# The defense and offense control sets differ only in which side's
# Inj_*_Last_w column they include.
_controls_before_injury = [
    "offensive_snaps_w",
    "defensive_snaps_w",
    "blowout_flag_w",
    "short_week_flag_w",
    "bye_last_week_flag_w",
    "home_flag_w",
    "off_yards_per_play_w",
]
_controls_after_injury = ["Cumulative_Workload_Index_w"]

control_terms_base_def = [
    *_controls_before_injury,
    "Inj_Def_Last_w",
    *_controls_after_injury,
]

control_terms_base_off = [
    *_controls_before_injury,
    "Inj_Off_Last_w",
    *_controls_after_injury,
]

# Alternative pairs of score terms; each (tag, terms) entry is fitted as its
# own specification.
script_specs = [
    ("points_for_diff", ["points_for", "score_diff_w"]),
    ("points_against_diff", ["points_against", "score_diff_w"]),
    ("points_for_against", ["points_for", "points_against"]),
]

# When several specifications fit, the earliest tag in this list is selected.
preferred_order = [
    "points_for_diff",
    "points_against_diff",
    "points_for_against",
]

def build_formula(outcome: str, base_controls: list[str], script_terms: list[str]) -> str:
    """Return the formula string ``outcome ~ term + term + ...``.

    The right-hand side is, in order: the module-level ``exposure_terms``,
    ``base_controls``, ``script_terms``, then the team and time fixed effects.
    """
    terms = [*exposure_terms, *base_controls, *script_terms, FE_TEAM, FE_TIME]
    return f"{outcome} ~ {' + '.join(terms)}"

def fit_logit_glm(formula: str, data: pd.DataFrame, groups: pd.Series):
    """Fit a binomial GLM for ``formula`` on ``data`` and return the result.

    The covariance is cluster-robust, with clusters given by ``groups``.
    NOTE(review): ``groups`` presumably has to line up row-for-row with the
    rows the formula keeps -- confirm no rows are dropped for missing values.
    """
    binomial_model = smf.glm(formula=formula, data=data, family=sm.families.Binomial())
    return binomial_model.fit(cov_type="cluster", cov_kwds={"groups": groups})

def fit_logit_grid(outcome: str, base_controls: list[str]) -> tuple[str, str, object]:
    """Fit every specification in ``script_specs`` and return the preferred one.

    Each specification is fitted on the module-level ``df`` with
    ``cluster_groups``. A specification whose fit raises is reported and
    skipped. Among the successful ones, the tag that appears earliest in
    ``preferred_order`` wins; the choice is not based on a fit statistic.

    Returns
    -------
    tuple
        ``(spec_tag, formula, fitted_result)`` for the selected specification.

    Raises
    ------
    RuntimeError
        If no specification could be fitted.
    """
    successful = []
    for tag, script_terms in script_specs:
        formula = build_formula(outcome, base_controls, script_terms)
        try:
            result = fit_logit_glm(formula, df, cluster_groups)
        except Exception as err:
            print("logit failed", outcome, tag, str(err))
        else:
            successful.append((tag, formula, result))
            print("fit ok", outcome, tag)

    if not successful:
        raise RuntimeError(f"No logistic specifications fit successfully for {outcome}")

    def preference_rank(fit):
        # Tags missing from preferred_order rank after every listed tag.
        tag = fit[0]
        return preferred_order.index(tag) if tag in preferred_order else 999

    # min() returns the first minimal element, matching a stable sort's head.
    return min(successful, key=preference_rank)

# Fit the specification grid once per side, then unpack (tag, formula, result).
defense_fit = fit_logit_grid("Any_Def_Injury_Next_w", control_terms_base_def)
offense_fit = fit_logit_grid("Any_Off_Injury_Next_w", control_terms_base_off)

spec_tag_def, formula_def_used, logit_def = defense_fit
spec_tag_off, formula_off_used, logit_off = offense_fit

# Report which specification was selected for each side.
print()
print("selected logistic spec defense", spec_tag_def)
print(formula_def_used, end="\n\n")
print("selected logistic spec offense", spec_tag_off)
print(formula_off_used)

fit ok Any_Def_Injury_Next_w points_for_diff
fit ok Any_Def_Injury_Next_w points_against_diff
fit ok Any_Def_Injury_Next_w points_for_against
fit ok Any_Off_Injury_Next_w points_for_diff
fit ok Any_Off_Injury_Next_w points_against_diff
fit ok Any_Off_Injury_Next_w points_for_against

selected logistic spec defense points_for_diff
Any_Def_Injury_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3 + offensive_snaps_w + defensive_snaps_w + blowout_flag_w + short_week_flag_w + bye_last_week_flag_w + home_flag_w + off_yards_per_play_w + Inj_Def_Last_w + Cumulative_Workload_Index_w + points_for + score_diff_w + C(team) + C(season_week)

selected logistic spec offense points_for_diff
Any_Off_Injury_Next_w ~ shock_nonscore + shock_x_blowout + ST_Vol_NonScore_w + Cum_Shocks_NonScore_w + ST_Shock_NonScore_w_minus_1 + ST_Shock_NonScore_w_minus_2 + ST_Shock_NonScore_w_minus_3

We export tidy logistic regression results with odds ratios and clustered confidence intervals, and write both DuckDB tables and CSV outputs that match the existing export conventions.

In [5]:
def tidy_logit_res(res, model_name: str, outcome_name: str, spec_tag: str, key_terms: list[str]) -> pd.DataFrame:
    """Flatten a fitted logistic model into one row per coefficient.

    Parameters
    ----------
    res
        Fitted result exposing ``params``, ``bse`` and ``pvalues`` as pandas
        Series in the same coefficient order. ``nobs``, ``aic``, ``bic`` and
        ``llf`` are read when present and recorded as NaN otherwise.
    model_name, outcome_name, spec_tag
        Labels copied onto every row (columns ``model``, ``outcome``,
        ``spec_tag``).
    key_terms
        Coefficient names to flag with ``is_key_term == 1``.

    Returns
    -------
    pd.DataFrame
        Columns, in order: model, spec_tag, outcome, term, beta, se_cluster,
        pvalue, nobs, aic, bic, llf, odds_ratio, or_ci_lo, or_ci_hi,
        is_key_term.
    """
    out = pd.DataFrame({
        "model": model_name,
        "spec_tag": spec_tag,
        "outcome": outcome_name,
        "term": res.params.index.astype(str),
        "beta": res.params.values.astype(float),
        # Whatever standard errors the result carries; the fits in this
        # notebook request a cluster-robust covariance.
        "se_cluster": res.bse.values.astype(float),
        "pvalue": res.pvalues.values.astype(float),
    })

    # int(nan) raises, so only cast when a finite observation count exists.
    nobs = getattr(res, "nobs", None)
    out["nobs"] = int(nobs) if nobs is not None and np.isfinite(nobs) else np.nan
    for stat in ("aic", "bic", "llf"):
        out[stat] = float(getattr(res, stat, np.nan))

    # Odds ratio and its interval: exponentiate beta and beta -/+ 1.96 * se
    # (normal-approximation 95% bounds on the log-odds scale).
    half_width = 1.96 * out["se_cluster"]
    out["odds_ratio"] = np.exp(out["beta"])
    out["or_ci_lo"] = np.exp(out["beta"] - half_width)
    out["or_ci_hi"] = np.exp(out["beta"] + half_width)

    out["is_key_term"] = out["term"].isin(set(key_terms)).astype("int64")
    return out

# Exposure coefficients are the ones flagged as key terms in the export.
key_terms = exposure_terms

logit_def_df = tidy_logit_res(logit_def, "logit_modelA_defense", "Any_Def_Injury_Next_w", spec_tag_def, key_terms)
logit_off_df = tidy_logit_res(logit_off, "logit_modelB_offense", "Any_Off_Injury_Next_w", spec_tag_off, key_terms)

logit_results = pd.concat([logit_def_df, logit_off_df], ignore_index=True)

# Materialize the frame as a DuckDB table through a temporarily registered
# view. The view is unregistered even if the CREATE statement fails.
con.register("step17_logit_tmp", logit_results)
try:
    con.execute("CREATE OR REPLACE TABLE step17_logit_results AS SELECT * FROM step17_logit_tmp")
finally:
    con.unregister("step17_logit_tmp")

# NOTE(review): this path is relative to the process working directory, not
# to REPO_ROOT -- confirm the notebook is always run from its own folder.
out_dir = Path("../outputs")
out_dir.mkdir(parents=True, exist_ok=True)

csv_path = out_dir / "step17_logit_results.csv"
logit_results.to_csv(csv_path, index=False)

print("wrote duckdb table step17_logit_results")
print("wrote csv", csv_path.resolve())

# Preview the flagged exposure rows for both models.
logit_results.query("is_key_term == 1").sort_values(["model", "term"]).head(30)

wrote duckdb table step17_logit_results
wrote csv /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/outputs/step17_logit_results.csv


Unnamed: 0,model,spec_tag,outcome,term,beta,se_cluster,pvalue,nobs,aic,bic,llf,odds_ratio,or_ci_lo,or_ci_hi,is_key_term
249,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,Cum_Shocks_NonScore_w,0.031273,0.072184,0.664845,5950,4657.924056,6424.386732,-2064.962028,1.031767,0.895647,1.188574,1
250,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,ST_Shock_NonScore_w_minus_1,0.083634,0.090913,0.357605,5950,4657.924056,6424.386732,-2064.962028,1.087231,0.909777,1.299298,1
251,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,ST_Shock_NonScore_w_minus_2,-0.044428,0.096585,0.64553,5950,4657.924056,6424.386732,-2064.962028,0.956545,0.791572,1.155901,1
252,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,ST_Shock_NonScore_w_minus_3,0.046378,0.122461,0.7049,5950,4657.924056,6424.386732,-2064.962028,1.04747,0.82395,1.331625,1
248,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,ST_Vol_NonScore_w,-0.021917,0.057706,0.704093,5950,4657.924056,6424.386732,-2064.962028,0.978322,0.873698,1.095474,1
246,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,shock_nonscore,0.067927,0.133606,0.611162,5950,4657.924056,6424.386732,-2064.962028,1.070287,0.823707,1.390683,1
247,logit_modelA_defense,points_for_diff,Any_Def_Injury_Next_w,shock_x_blowout,0.211505,0.261167,0.418029,5950,4657.924056,6424.386732,-2064.962028,1.235536,0.740536,2.061412,1
513,logit_modelB_offense,points_for_diff,Any_Off_Injury_Next_w,Cum_Shocks_NonScore_w,-0.070276,0.077668,0.365555,5950,5200.285247,6966.747923,-2336.142624,0.932136,0.800511,1.085404,1
514,logit_modelB_offense,points_for_diff,Any_Off_Injury_Next_w,ST_Shock_NonScore_w_minus_1,0.045125,0.114413,0.693284,5950,5200.285247,6966.747923,-2336.142624,1.046158,0.836002,1.309144,1
515,logit_modelB_offense,points_for_diff,Any_Off_Injury_Next_w,ST_Shock_NonScore_w_minus_2,0.09779,0.125287,0.43508,5950,5200.285247,6966.747923,-2336.142624,1.102731,0.862627,1.409667,1


We compute predicted probabilities for each logistic model at different exposure levels using g-computation on the existing estimation sample, so that the contrasts keep the same distribution of team and time fixed effects.

In [6]:
# 25th / 75th percentile anchors used as the "low" and "high" exposure levels.
vol_low, vol_high = (
    float(q) for q in df["ST_Vol_NonScore_w"].quantile([0.25, 0.75])
)
cum_low, cum_high = (
    float(q) for q in df["Cum_Shocks_NonScore_w"].quantile([0.25, 0.75])
)

def mean_pred_prob(res, base_df: pd.DataFrame, updates: dict) -> float:
    """Average ``res.predict`` over ``base_df`` after overriding some columns.

    Parameters
    ----------
    res
        Object with a ``predict(frame)`` method.
    base_df
        Rows to predict on. A copy is modified; the input is left unchanged.
    updates
        Mapping of column name to the value assigned to that whole column.
        When it sets ``shock_nonscore``, ``shock_x_blowout`` is rebuilt as
        ``shock_nonscore * blowout_flag_w`` so the interaction stays
        consistent with the overridden shock.

    Returns
    -------
    float
        Mean of the predictions.
    """
    frame = base_df.copy()
    for column, value in updates.items():
        frame[column] = value

    if "shock_nonscore" in updates:
        shock = frame["shock_nonscore"].astype(int)
        blowout = frame["blowout_flag_w"].astype(int)
        frame["shock_x_blowout"] = (shock * blowout).astype(int)

    return float(np.mean(res.predict(frame)))

# Accumulates one dict per (side, scenario); filled by add_scenarios below.
scenarios = []

def add_scenarios(res, side: str):
    """Append the average predicted probability of each scenario for ``side``.

    Every scenario overrides at most one exposure column on the module-level
    ``df`` and averages ``res``'s predictions over all rows. Rows are appended
    to the module-level ``scenarios`` list; nothing is returned.
    """
    counterfactuals = [
        ("baseline_observed", {}),
        ("shock_0", {"shock_nonscore": 0}),
        ("shock_1", {"shock_nonscore": 1}),
        ("vol_low_p25", {"ST_Vol_NonScore_w": vol_low}),
        ("vol_high_p75", {"ST_Vol_NonScore_w": vol_high}),
        ("cum_shocks_few_p25", {"Cum_Shocks_NonScore_w": cum_low}),
        ("cum_shocks_many_p75", {"Cum_Shocks_NonScore_w": cum_high}),
    ]
    for label, updates in counterfactuals:
        scenarios.append({
            "side": side,
            "scenario": label,
            "avg_pred_prob": mean_pred_prob(res, df, updates),
        })

add_scenarios(logit_def, "defense")
add_scenarios(logit_off, "offense")

# Attach the quantile anchors as constant columns so every row records the
# exposure levels the scenarios were evaluated at.
pred_df = pd.DataFrame(scenarios).assign(
    vol_low_p25=vol_low,
    vol_high_p75=vol_high,
    cum_low_p25=cum_low,
    cum_high_p75=cum_high,
)

pred_df

Unnamed: 0,side,scenario,avg_pred_prob,vol_low_p25,vol_high_p75,cum_low_p25,cum_high_p75
0,defense,baseline_observed,0.844706,1.94079,3.32666,0.0,2.0
1,defense,shock_0,0.842355,1.94079,3.32666,0.0,2.0
2,defense,shock_1,0.857243,1.94079,3.32666,0.0,2.0
3,defense,vol_low_p25,0.846112,1.94079,3.32666,0.0,2.0
4,defense,vol_high_p75,0.842896,1.94079,3.32666,0.0,2.0
5,defense,cum_shocks_few_p25,0.840408,1.94079,3.32666,0.0,2.0
6,defense,cum_shocks_many_p75,0.847054,1.94079,3.32666,0.0,2.0
7,offense,baseline_observed,0.822857,1.94079,3.32666,0.0,2.0
8,offense,shock_0,0.819054,1.94079,3.32666,0.0,2.0
9,offense,shock_1,0.841711,1.94079,3.32666,0.0,2.0
