Import the Python dependencies and open a DuckDB connection that every later cell reuses.

In [2]:
import warnings
from pathlib import Path

import duckdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

try:
    import statsmodels.genmod.generalized_linear_model as glm
    glm.SET_USE_BIC_LLF(True)
except Exception:
    pass

warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="The bic value is computed using the deviance formula.*",
)

# Absolute working directory; the search below starts here and walks up through every parent.
CWD = Path().resolve()

REPO_ROOT = None
DB_FILE = None

# Relative locations in priority order: every directory on the way up is
# checked for db/nflpa.duckdb before any directory is checked for a bare
# nflpa.duckdb. The first hit fixes both REPO_ROOT (the directory that
# matched) and DB_FILE (the database path itself).
for rel_path in (Path("db") / "nflpa.duckdb", Path("nflpa.duckdb")):
    for ancestor in (CWD, *CWD.parents):
        candidate = ancestor / rel_path
        if candidate.exists():
            REPO_ROOT, DB_FILE = ancestor, candidate
            break
    if DB_FILE is not None:
        break

# Nothing matched in either layout: stop the notebook here.
if DB_FILE is None:
    raise RuntimeError("Could not find nflpa.duckdb, expected db/nflpa.duckdb or nflpa.duckdb near this notebook")

# Open the DuckDB file found by the search above. The connection is opened
# read-write (read_only=False) and kept in `con`.
con = duckdb.connect(str(DB_FILE), read_only=False)

# Name of the table/view this notebook uses as its modeling frame.
base_table = "step16_modeling_frame_nolookahead"

# Fail fast when the modeling frame is absent from the `main` schema.
# The table name is supplied as a bound parameter rather than spliced into
# the SQL text, so the lookup does not depend on the name's quoting.
exists_df = con.execute(
    """
SELECT
  COUNT(*) AS n
FROM information_schema.tables
WHERE table_schema = 'main'
  AND table_name = ?
  AND table_type IN ('BASE TABLE', 'VIEW')
""",
    [base_table],
).df()

if int(exists_df["n"].iloc[0]) == 0:
    # Name the table that was actually looked up; the previous hard-coded
    # message referred to a different table than the one being checked.
    raise RuntimeError(f"Missing {base_table}, run notebook 16 first")

print("connected db", str(DB_FILE))
print("base table", base_table)

connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
base table step16_modeling_frame_nolookahead


Quick sanity check: confirm that the binary outcomes have variation and that the predictors used in the logistic models have no missing values in the modeling frame.

In [3]:
# Load the entire modeling frame into pandas.
df = con.execute(f"SELECT * FROM {base_table}").df()

# Derive the binary outcome columns: 1 when the source column is >= 1, else 0.
# NOTE(review): a missing (NaN) source value compares False and is therefore
# coded 0 — confirm the Inj_*_Next_w columns contain no missing values.
for flag_col, count_col in (
    ("Any_Def_Injury_Next_w", "Inj_Def_Next_w"),
    ("Any_Off_Injury_Next_w", "Inj_Off_Next_w"),
):
    df[flag_col] = df[count_col].astype(float).ge(1.0).astype(int)

def_flag = df["Any_Def_Injury_Next_w"]
off_flag = df["Any_Off_Injury_Next_w"]

# One-row overview: row count, share of positive rows, and number of positive rows per outcome.
summary = pd.DataFrame(
    {
        "n_rows": [len(df)],
        "def_rate": [float(def_flag.mean())],
        "off_rate": [float(off_flag.mean())],
        "def_any_ones": [int(def_flag.sum())],
        "off_any_ones": [int(off_flag.sum())],
    }
)

# One-row table of missing-value counts, one "missing_<column>" entry per core predictor.
core_predictors = [
    "ST_Vol_NonScore_w",
    "Cum_Shocks_NonScore_w",
    "points_for",
    "points_against",
]
missing_core = pd.DataFrame(
    {f"missing_{col}": [int(df[col].isna().sum())] for col in core_predictors}
)

# Cell output: both tables as a tuple.
summary, missing_core

(   n_rows  def_rate  off_rate  def_any_ones  off_any_ones
 0    5950  0.844706  0.822857          5026          4896,
    missing_ST_Vol_NonScore_w  missing_Cum_Shocks_NonScore_w  \
 0                          0                              0   
 
    missing_points_for  missing_points_against  
 0                   0                       0  )