We initialize the Python imports and open a DuckDB connection that every later cell reuses

In [None]:
import duckdb
import pandas as pd
import numpy as np

from pathlib import Path

# Walk from the working directory up through its ancestors and take the
# first directory that contains a "db" folder as the project root.
cwd = Path.cwd()
root = next(
    (candidate for candidate in (cwd, *cwd.parents) if (candidate / "db").exists()),
    None,
)

if root is None:
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# Module-level connection object; the cells below all issue their SQL through it.
con = duckdb.connect(str(DB_PATH))

# Session settings: worker thread count and memory ceiling.
for pragma_stmt in ("PRAGMA threads=4", "PRAGMA memory_limit='4GB'"):
    con.execute(pragma_stmt)

Quick sanity check to confirm that the required tables exist before building any outcomes

In [None]:
# Tables this step reads from; all of them must already exist in the database.
required_tables = ["team_week_panel", "injuries", "rosters_weekly"]

# Names of every table currently visible on the connection.
show_tables_df = con.execute("SHOW TABLES").df()
tables = set(show_tables_df["name"].tolist())

missing = [name for name in required_tables if name not in tables]
print("Missing tables", missing)

# Stop early with an explicit message instead of failing inside a later query.
if missing:
    raise RuntimeError("Step 9 cannot run because tables are missing, missing are " + ", ".join(missing))

We detect key columns dynamically so the logic does not assume specific team or player identifier names, ensuring the injury counts are calculated accurately regardless of how the columns are named

In [None]:
def _existing_cols(table_name):
    """Return the set of column names DuckDB reports for ``table_name``.

    The names come from the ``name`` column of ``PRAGMA table_info``.
    """
    table_info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    column_names = table_info["name"].tolist()
    return set(column_names)

def _pick_first(existing, candidates, label):
    """Return the first entry of ``candidates`` that is present in ``existing``.

    Args:
        existing: Container of available column names.
        candidates: Column names to try, in priority order.
        label: Human-readable description used in the error message.

    Raises:
        RuntimeError: If none of the candidates is present.
    """
    chosen = next((name for name in candidates if name in existing), None)
    if chosen is None:
        raise RuntimeError("Could not find a usable column for " + label + ", candidates were " + ", ".join(candidates))
    return chosen

def _pick_common(existing_a, existing_b, candidates, label):
    """Return the first entry of ``candidates`` present in both column sets.

    Args:
        existing_a: Available column names of the first table.
        existing_b: Available column names of the second table.
        candidates: Column names to try, in priority order.
        label: Human-readable description used in the error message.

    Raises:
        RuntimeError: If no candidate exists in both containers.
    """
    shared = next(
        (name for name in candidates if name in existing_a and name in existing_b),
        None,
    )
    if shared is None:
        raise RuntimeError("Could not find a common column for " + label + ", candidates were " + ", ".join(candidates))
    return shared

# Column inventories of the three required tables.
panel_cols = _existing_cols("team_week_panel")
inj_cols = _existing_cols("injuries")
rost_cols = _existing_cols("rosters_weekly")

# Prefer team_id and fall back to team; going through _pick_first raises a
# clear RuntimeError here if the panel has neither column, instead of failing
# later inside a SQL statement.
TEAM_COL = _pick_first(panel_cols, ["team_id", "team"], "team_week_panel team column")
TEAM_ABBR_COL = "team" if "team" in panel_cols else TEAM_COL

print("Using TEAM_COL", TEAM_COL)
print("Using TEAM_ABBR_COL", TEAM_ABBR_COL)

# The panel's own team column name is tried first in the other tables.
inj_team_candidates = [TEAM_ABBR_COL, "team", "team_abbr", "posteam", "club", "team_id"]
rost_team_candidates = [TEAM_ABBR_COL, "team", "team_abbr", "posteam", "club", "team_id"]

INJ_TEAM_COL = _pick_first(inj_cols, inj_team_candidates, "injuries team column")
ROST_TEAM_COL = _pick_first(rost_cols, rost_team_candidates, "rosters_weekly team column")

season_candidates = ["season", "year"]
week_candidates = ["week"]

INJ_SEASON_COL = _pick_first(inj_cols, season_candidates, "injuries season column")
INJ_WEEK_COL = _pick_first(inj_cols, week_candidates, "injuries week column")

ROST_SEASON_COL = _pick_first(rost_cols, season_candidates, "rosters_weekly season column")
ROST_WEEK_COL = _pick_first(rost_cols, week_candidates, "rosters_weekly week column")

player_key_candidates = [
    "player_id",
    "gsis_id",
    "nfl_id",
    "pfr_id",
    "esb_id",
    "sportradar_id",
]

# The player key must exist under the same name in both injuries and rosters.
PLAYER_KEY_COL = _pick_common(inj_cols, rost_cols, player_key_candidates, "player key between injuries and rosters_weekly")

# Position column in rosters_weekly is optional (None when absent).
pos_candidates = ["position", "pos", "position_abbr"]
ROST_POS_COL = next((c for c in pos_candidates if c in rost_cols), None)

players_table_exists = "players" in tables
PLAYERS_PLAYER_KEY_COL = None
PLAYERS_POS_COL = None

if players_table_exists:
    ply_cols = _existing_cols("players")
    if PLAYER_KEY_COL in ply_cols:
        # Use the same identifier that links injuries and rosters, so the
        # later join against players compares like with like.
        PLAYERS_PLAYER_KEY_COL = PLAYER_KEY_COL
    else:
        # Fallback: first known identifier present in players. This is a
        # different identifier column than PLAYER_KEY_COL, so joins on it may
        # not match; surface that instead of failing silently.
        PLAYERS_PLAYER_KEY_COL = next((c for c in player_key_candidates if c in ply_cols), None)
        if PLAYERS_PLAYER_KEY_COL is not None:
            print(
                "Warning, players has no column named", PLAYER_KEY_COL,
                "so", PLAYERS_PLAYER_KEY_COL, "is used instead, verify the identifiers are comparable",
            )
    PLAYERS_POS_COL = next((c for c in pos_candidates if c in ply_cols), None)

print("Using INJ_TEAM_COL", INJ_TEAM_COL)
print("Using ROST_TEAM_COL", ROST_TEAM_COL)
print("Using PLAYER_KEY_COL", PLAYER_KEY_COL)
print("Using ROST_POS_COL", ROST_POS_COL)
print("Players table exists", players_table_exists)
print("Using PLAYERS_PLAYER_KEY_COL", PLAYERS_PLAYER_KEY_COL)
print("Using PLAYERS_POS_COL", PLAYERS_POS_COL)

Quick sanity check to confirm that the intersection of unique team identifiers across the panel and injury data is nearly 100%, preventing a scenario where injuries are "lost" because of mismatched abbreviations

In [None]:
# Distinct team identifiers as they appear in the panel and in the injury data.
panel_teams = con.execute(f"SELECT DISTINCT {TEAM_ABBR_COL} AS team_key FROM team_week_panel").df()
inj_teams = con.execute(f"SELECT DISTINCT {INJ_TEAM_COL} AS team_key FROM injuries").df()

# Compare as strings with missing values removed.
panel_set = set(panel_teams["team_key"].dropna().astype(str))
inj_set = set(inj_teams["team_key"].dropna().astype(str))

# Keep at most 50 examples from each side of the difference for printing.
only_in_panel = sorted(panel_set - inj_set)[:50]
only_in_inj = sorted(inj_set - panel_set)[:50]
shared_teams = panel_set & inj_set

print("Example teams only in team_week_panel", only_in_panel)
print("Example teams only in injuries", only_in_inj)
print("Panel team count", len(panel_set))
print("Injuries team count", len(inj_set))
print("Overlap team count", len(shared_teams))

We build a cleaned version of the raw injury reports by removing duplicate player entries within the same game week to ensure that our starting point for counting new injuries is mathematically unique

In [None]:
# Drop any previous copy so the cell can be re-run.
con.execute("DROP TABLE IF EXISTS injuries_players_distinct")

# One row per distinct (season, week, team, player); rows with a NULL in any
# of the four key columns are excluded.
injuries_distinct_sql = f"""
CREATE TABLE injuries_players_distinct AS
SELECT DISTINCT
  CAST({INJ_SEASON_COL} AS INTEGER) AS season,
  CAST({INJ_WEEK_COL} AS INTEGER) AS week,
  CAST({INJ_TEAM_COL} AS VARCHAR) AS team_key,
  CAST({PLAYER_KEY_COL} AS VARCHAR) AS player_key
FROM injuries
WHERE {INJ_SEASON_COL} IS NOT NULL
  AND {INJ_WEEK_COL} IS NOT NULL
  AND {INJ_TEAM_COL} IS NOT NULL
  AND {PLAYER_KEY_COL} IS NOT NULL
"""
con.execute(injuries_distinct_sql)

injuries_distinct_rows = con.execute("SELECT COUNT(*) AS rows FROM injuries_players_distinct").df()
print(injuries_distinct_rows)

We implement a lookup that attaches a list of all currently injured players to every team-week record, which allows the model to identify who was already sidelined before calculating next-week outcomes

In [None]:
# Drop any previous copy so the cell can be re-run.
con.execute("DROP TABLE IF EXISTS injury_players_w")

# Inner join: each panel team-week is paired with the players listed in the
# de-duplicated injury table for that same season, week and team.
injury_players_w_sql = f"""
CREATE TABLE injury_players_w AS
SELECT
  p.season,
  p.week,
  CAST(p.{TEAM_ABBR_COL} AS VARCHAR) AS team_key,
  i.player_key
FROM team_week_panel p
JOIN injuries_players_distinct i
  ON i.season = p.season
 AND i.week = p.week
 AND i.team_key = CAST(p.{TEAM_ABBR_COL} AS VARCHAR)
"""
con.execute(injury_players_w_sql)

injury_players_w_rows = con.execute("SELECT COUNT(*) AS rows FROM injury_players_w").df()
print(injury_players_w_rows)

We implement a look-ahead join that attaches the list of players on the injury report for week 'w+1' to the data for week 'w', which provides us with the material for calculating the upcoming injury risk

In [None]:
# Drop any previous copy so the cell can be re-run.
con.execute("DROP TABLE IF EXISTS injury_players_w_plus_1")

# Look-ahead join: rows keep the panel's week w but carry the players listed
# for the same season and team at week w + 1.
injury_players_next_sql = f"""
CREATE TABLE injury_players_w_plus_1 AS
SELECT
  p.season,
  p.week,
  CAST(p.{TEAM_ABBR_COL} AS VARCHAR) AS team_key,
  i_next.player_key
FROM team_week_panel p
JOIN injuries_players_distinct i_next
  ON i_next.season = p.season
 AND i_next.week = p.week + 1
 AND i_next.team_key = CAST(p.{TEAM_ABBR_COL} AS VARCHAR)
"""
con.execute(injury_players_next_sql)

injury_players_next_rows = con.execute("SELECT COUNT(*) AS rows FROM injury_players_w_plus_1").df()
print(injury_players_next_rows)

Quick sanity check to confirm that injury player lists do not contain duplicate player entries for the same team week key

In [None]:
def _count_duplicate_player_rows(table_name):
    """Return how many (season, week, team_key, player_key) groups in
    ``table_name`` contain more than one row."""
    dup_df = con.execute(f"""
SELECT
  COUNT(*) AS n_dup
FROM (
  SELECT season, week, team_key, player_key, COUNT(*) AS c
  FROM {table_name}
  GROUP BY 1,2,3,4
  HAVING COUNT(*) > 1
)
""").df()
    return dup_df["n_dup"].iloc[0]


dups_w = _count_duplicate_player_rows("injury_players_w")
dups_w1 = _count_duplicate_player_rows("injury_players_w_plus_1")

print("Duplicate rows in injury_players_w", dups_w)
print("Duplicate rows in injury_players_w_plus_1", dups_w1)

# Either list containing a repeated key is treated as a hard failure.
if any(int(n_dup) != 0 for n_dup in (dups_w, dups_w1)):
    raise RuntimeError("Duplicate player entries exist in injury lists, investigate upstream injuries table structure")

We implement a set subtraction operation to isolate "new" injuries, ensuring that we only count players who were freshly added to the report rather than those with multi-week conditions

In [None]:
# Drop any previous copy so the cell can be re-run.
con.execute("DROP TABLE IF EXISTS new_injured_players_w_plus_1_raw")

# Anti-join: keep week w+1 players that have no matching row in the week w
# list for the same season, week, team and player.
new_injuries_raw_sql = """
CREATE TABLE new_injured_players_w_plus_1_raw AS
SELECT
  n.season,
  n.week,
  n.team_key,
  n.player_key
FROM injury_players_w_plus_1 n
LEFT JOIN injury_players_w c
  ON c.season = n.season
 AND c.week = n.week
 AND c.team_key = n.team_key
 AND c.player_key = n.player_key
WHERE c.player_key IS NULL
"""
con.execute(new_injuries_raw_sql)

new_injuries_raw_rows = con.execute("SELECT COUNT(*) AS rows FROM new_injured_players_w_plus_1_raw").df()
print(new_injuries_raw_rows)

We implement a filter that excludes any player from the "new injury" list who was not present on the team's roster during the current week, to prevent the model from incorrectly attributing a new signing's pre-existing injury to the team's workload

In [None]:
# Step 1: distinct (season, week, team, player) rows from the weekly rosters,
# excluding rows with a NULL in any key column.
con.execute("DROP TABLE IF EXISTS roster_players_w_distinct")

roster_distinct_sql = f"""
CREATE TABLE roster_players_w_distinct AS
SELECT DISTINCT
  CAST({ROST_SEASON_COL} AS INTEGER) AS season,
  CAST({ROST_WEEK_COL} AS INTEGER) AS week,
  CAST({ROST_TEAM_COL} AS VARCHAR) AS team_key,
  CAST({PLAYER_KEY_COL} AS VARCHAR) AS player_key
FROM rosters_weekly
WHERE {ROST_SEASON_COL} IS NOT NULL
  AND {ROST_WEEK_COL} IS NOT NULL
  AND {ROST_TEAM_COL} IS NOT NULL
  AND {PLAYER_KEY_COL} IS NOT NULL
"""
con.execute(roster_distinct_sql)

# Step 2: inner join keeps only new injuries whose player appears on the same
# team's roster for the same season and week.
con.execute("DROP TABLE IF EXISTS new_injured_players_w_plus_1_eligible")

eligible_sql = """
CREATE TABLE new_injured_players_w_plus_1_eligible AS
SELECT
  n.season,
  n.week,
  n.team_key,
  n.player_key
FROM new_injured_players_w_plus_1_raw n
JOIN roster_players_w_distinct r
  ON r.season = n.season
 AND r.week = n.week
 AND r.team_key = n.team_key
 AND r.player_key = n.player_key
"""
con.execute(eligible_sql)

eligible_rows = con.execute("SELECT COUNT(*) AS rows FROM new_injured_players_w_plus_1_eligible").df()
print(eligible_rows)

Quick sanity check to confirm that roster restriction is not dropping nearly everything, which would signal a player id mismatch

In [None]:
# Row counts before and after the roster restriction.
raw_count_df = con.execute("SELECT COUNT(*) AS n FROM new_injured_players_w_plus_1_raw").df()
elig_count_df = con.execute("SELECT COUNT(*) AS n FROM new_injured_players_w_plus_1_eligible").df()
raw_n = int(raw_count_df["n"].iloc[0])
elig_n = int(elig_count_df["n"].iloc[0])

# Share of raw rows removed by the restriction; defined as 0.0 when there
# are no raw rows so the division is never attempted on zero.
drop_rate = float((raw_n - elig_n) / raw_n) if raw_n > 0 else 0.0

print("Raw new injury rows", raw_n)
print("Eligible new injury rows", elig_n)
print("Drop rate from roster restriction", drop_rate)

if raw_n > 0 and drop_rate > 0.95:
    raise RuntimeError("Roster restriction dropped over 95 percent, investigate player key alignment across tables")

We implement a categorization logic that translates raw position codes into broad structural groups, ensuring that injured offensive players are always counted toward offensive injury totals and injured defensive players are always counted toward defensive injury totals

In [None]:
# A position source is required: either a position column in rosters_weekly
# or a players table exposing both a key column and a position column.
players_mapping_usable = (
    players_table_exists
    and PLAYERS_POS_COL is not None
    and PLAYERS_PLAYER_KEY_COL is not None
)
if ROST_POS_COL is None and not players_mapping_usable:
    raise RuntimeError("No position column found in rosters_weekly and no usable position mapping found in players")

con.execute("DROP TABLE IF EXISTS roster_player_positions_w")

if ROST_POS_COL is None:
    # No roster position column: create the table with the expected columns
    # but zero rows so later LEFT JOINs against it still work.
    roster_positions_sql = """
    CREATE TABLE roster_player_positions_w AS
    SELECT
      NULL::INTEGER AS season,
      NULL::INTEGER AS week,
      NULL::VARCHAR AS team_key,
      NULL::VARCHAR AS player_key,
      NULL::VARCHAR AS pos_raw
    WHERE 1 = 0
    """
else:
    # Positions are trimmed and upper-cased, blanks become NULL, and each
    # (season, week, team, player) collapses to a single row via MIN.
    roster_positions_sql = f"""
    CREATE TABLE roster_player_positions_w AS
    WITH base AS (
      SELECT
        CAST({ROST_SEASON_COL} AS INTEGER) AS season,
        CAST({ROST_WEEK_COL} AS INTEGER) AS week,
        CAST({ROST_TEAM_COL} AS VARCHAR) AS team_key,
        CAST({PLAYER_KEY_COL} AS VARCHAR) AS player_key,
        NULLIF(UPPER(TRIM(CAST({ROST_POS_COL} AS VARCHAR))), '') AS pos_clean
      FROM rosters_weekly
      WHERE {ROST_SEASON_COL} IS NOT NULL
        AND {ROST_WEEK_COL} IS NOT NULL
        AND {ROST_TEAM_COL} IS NOT NULL
        AND {PLAYER_KEY_COL} IS NOT NULL
    ),
    collapsed AS (
      SELECT
        season,
        week,
        team_key,
        player_key,
        MIN(pos_clean) AS pos_raw
      FROM base
      WHERE pos_clean IS NOT NULL
      GROUP BY 1,2,3,4
    )
    SELECT
      season,
      week,
      team_key,
      player_key,
      pos_raw
    FROM collapsed
    """

con.execute(roster_positions_sql)

# Drop any previous copy so the cell can be re-run.
con.execute("DROP TABLE IF EXISTS new_injured_players_mapped_side")

# SQL IN-list of position codes classified as offense.
off_set = (
    "'QB','RB','HB','TB','FB','WR','TE',"
    "'OL','C','G','T','LT','LG','RG','RT','OT','OG'"
)

# SQL IN-list of position codes classified as defense.
def_set = (
    "'DL','DE','DT','NT','EDGE',"
    "'LB','ILB','OLB',"
    "'CB','DB',"
    "'S','FS','SS'"
)

if players_table_exists and PLAYERS_POS_COL is not None and PLAYERS_PLAYER_KEY_COL is not None:
    # Roster position takes precedence; the players table fills the gaps.
    # players is first collapsed to one row per key (players_pos) so that a
    # repeated key there cannot multiply injured-player rows and inflate the
    # counts. The collapse applies the same trim / upper-case / blank-to-NULL
    # cleaning and MIN tie-break as roster_player_positions_w.
    con.execute(f"""
    CREATE TABLE new_injured_players_mapped_side AS
    WITH base AS (
      SELECT
        n.season,
        n.week,
        n.team_key,
        n.player_key
      FROM new_injured_players_w_plus_1_eligible n
    ),
    with_roster_pos AS (
      SELECT
        b.*,
        rp.pos_raw AS roster_pos
      FROM base b
      LEFT JOIN roster_player_positions_w rp
        ON rp.season = b.season
       AND rp.week = b.week
       AND rp.team_key = b.team_key
       AND rp.player_key = b.player_key
    ),
    players_pos AS (
      SELECT
        CAST({PLAYERS_PLAYER_KEY_COL} AS VARCHAR) AS player_key,
        MIN(NULLIF(UPPER(TRIM(CAST({PLAYERS_POS_COL} AS VARCHAR))), '')) AS player_pos
      FROM players
      WHERE {PLAYERS_PLAYER_KEY_COL} IS NOT NULL
      GROUP BY 1
    ),
    with_player_pos AS (
      SELECT
        w.*,
        p.player_pos
      FROM with_roster_pos w
      LEFT JOIN players_pos p
        ON p.player_key = w.player_key
    ),
    final AS (
      SELECT
        season,
        week,
        team_key,
        player_key,
        COALESCE(roster_pos, player_pos) AS pos_raw,
        UPPER(TRIM(COALESCE(roster_pos, player_pos))) AS pos_clean
      FROM with_player_pos
    )
    SELECT
      season,
      week,
      team_key,
      player_key,
      pos_raw,
      CASE
        WHEN pos_clean IN ({off_set}) THEN 'OFF'
        WHEN pos_clean IN ({def_set}) THEN 'DEF'
        ELSE 'OTHER'
      END AS pos_side
    FROM final
    """)
else:
    # Only the roster positions are available; a missing position yields a
    # NULL pos_clean, which falls through to 'OTHER'.
    con.execute(f"""
    CREATE TABLE new_injured_players_mapped_side AS
    WITH final AS (
      SELECT
        n.season,
        n.week,
        n.team_key,
        n.player_key,
        rp.pos_raw AS pos_raw,
        UPPER(TRIM(rp.pos_raw)) AS pos_clean
      FROM new_injured_players_w_plus_1_eligible n
      LEFT JOIN roster_player_positions_w rp
        ON rp.season = n.season
       AND rp.week = n.week
       AND rp.team_key = n.team_key
       AND rp.player_key = n.player_key
    )
    SELECT
      season,
      week,
      team_key,
      player_key,
      pos_raw,
      CASE
        WHEN pos_clean IN ({off_set}) THEN 'OFF'
        WHEN pos_clean IN ({def_set}) THEN 'DEF'
        ELSE 'OTHER'
      END AS pos_side
    FROM final
    """)
print(con.execute("SELECT COUNT(*) AS rows FROM new_injured_players_mapped_side").df())

We implement the aggregation of new offensive and defensive injuries into final integer counts for each record to represent the primary outcomes for the following week

In [None]:
# Step 1: flag each panel row with has_next_week = 1 when the panel also has
# a row for the same season and team at week + 1, else 0.
con.execute("DROP TABLE IF EXISTS team_week_has_next")

has_next_sql = f"""
CREATE TABLE team_week_has_next AS
SELECT
  a.season,
  a.week,
  CAST(a.{TEAM_ABBR_COL} AS VARCHAR) AS team_key,
  CASE
    WHEN b.week IS NULL THEN 0
    ELSE 1
  END AS has_next_week
FROM team_week_panel a
LEFT JOIN team_week_panel b
  ON b.season = a.season
 AND CAST(b.{TEAM_ABBR_COL} AS VARCHAR) = CAST(a.{TEAM_ABBR_COL} AS VARCHAR)
 AND b.week = a.week + 1
"""
con.execute(has_next_sql)

# Step 2: count new injuries per team-week by side. Team-weeks with a next
# week but no new injuries get 0; team-weeks without a next week get NULL.
con.execute("DROP TABLE IF EXISTS injury_counts_next")

injury_counts_sql = """
CREATE TABLE injury_counts_next AS
WITH counts AS (
  SELECT
    season,
    week,
    team_key,
    SUM(CASE WHEN pos_side = 'OFF' THEN 1 ELSE 0 END) AS Inj_Off_Next_w,
    SUM(CASE WHEN pos_side = 'DEF' THEN 1 ELSE 0 END) AS Inj_Def_Next_w,
    SUM(CASE WHEN pos_side = 'OTHER' THEN 1 ELSE 0 END) AS Inj_Other_Next_w
  FROM new_injured_players_mapped_side
  GROUP BY 1,2,3
)
SELECT
  t.season,
  t.week,
  t.team_key,
  t.has_next_week,
  CASE WHEN t.has_next_week = 1 THEN COALESCE(c.Inj_Off_Next_w, 0) ELSE NULL END AS Inj_Off_Next_w,
  CASE WHEN t.has_next_week = 1 THEN COALESCE(c.Inj_Def_Next_w, 0) ELSE NULL END AS Inj_Def_Next_w,
  CASE WHEN t.has_next_week = 1 THEN COALESCE(c.Inj_Other_Next_w, 0) ELSE NULL END AS Inj_Other_Next_w
FROM team_week_has_next t
LEFT JOIN counts c
  ON c.season = t.season
 AND c.week = t.week
 AND c.team_key = t.team_key
"""
con.execute(injury_counts_sql)

injury_counts_rows = con.execute("SELECT COUNT(*) AS rows FROM injury_counts_next").df()
print(injury_counts_rows)

We implement a one-week shift of the injury variables to create "prior health" controls, which allows the model to distinguish between a new spike in injuries and a persistent trend of poor health within a team

In [None]:
# Drop any previous copy so the cell can be re-run.
con.execute("DROP TABLE IF EXISTS injury_counts_with_lags")

# LAG reads the previous row's next-week count within the same season and
# team, ordered by week. When there is no previous row, or its value is NULL,
# COALESCE turns the result into 0.
lagged_counts_sql = """
CREATE TABLE injury_counts_with_lags AS
SELECT
  season,
  week,
  team_key,
  has_next_week,
  Inj_Off_Next_w,
  Inj_Def_Next_w,
  Inj_Other_Next_w,
  COALESCE(
    LAG(Inj_Off_Next_w) OVER (PARTITION BY season, team_key ORDER BY week),
    0
  ) AS Inj_Off_Last_w,
  COALESCE(
    LAG(Inj_Def_Next_w) OVER (PARTITION BY season, team_key ORDER BY week),
    0
  ) AS Inj_Def_Last_w
FROM injury_counts_next
"""
con.execute(lagged_counts_sql)

lagged_counts_rows = con.execute("SELECT COUNT(*) AS rows FROM injury_counts_with_lags").df()
print(lagged_counts_rows)

We add the newly calculated offensive and defensive injury counts, along with their respective health controls, back into the main dataset without creating duplicate records or losing observations

In [None]:
def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build an ``alias.*`` select expression that omits selected columns.

    Only the entries of ``cols_to_maybe_exclude`` that actually exist in
    ``table_name`` are placed in the EXCLUDE list; when none exist the plain
    ``alias.*`` is returned.
    """
    table_cols = _existing_cols(table_name)
    present = [name for name in cols_to_maybe_exclude if name in table_cols]
    if not present:
        return f"{alias}.*"
    excluded = ", ".join(present)
    return f"{alias}.* EXCLUDE ({excluded})"

# Stage the outcome columns directly in DuckDB, keyed like the panel.
# Building the staging table in SQL (instead of exporting to a pandas
# DataFrame and re-importing it) keeps the counts in their integer SQL type
# and leaves no DataFrame registered on the connection.
con.execute("DROP TABLE IF EXISTS injury_outcomes_tmp")
con.execute(f"""
CREATE TABLE injury_outcomes_tmp AS
SELECT
  season,
  week,
  team_key AS {TEAM_ABBR_COL},
  Inj_Off_Next_w,
  Inj_Def_Next_w,
  Inj_Off_Last_w,
  Inj_Def_Last_w
FROM injury_counts_with_lags
""")

pre_rows = int(con.execute("SELECT COUNT(*) AS n FROM team_week_panel").df()["n"].iloc[0])

# Drop outcome columns left over from a previous run so they are not duplicated.
star = _star_excluding(
    "team_week_panel",
    "p",
    ["Inj_Off_Next_w", "Inj_Def_Next_w", "Inj_Off_Last_w", "Inj_Def_Last_w"]
)

# Dry run of the join: verify the row count BEFORE the panel is overwritten,
# so a bad join cannot leave a corrupted team_week_panel behind.
joined_rows = int(con.execute(f"""
SELECT COUNT(*) AS n
FROM team_week_panel p
LEFT JOIN injury_outcomes_tmp o
USING (season, week, {TEAM_ABBR_COL})
""").df()["n"].iloc[0])

print("Rows before", pre_rows)
print("Rows after", joined_rows)

if pre_rows != joined_rows:
    raise RuntimeError("Row count changed after adding injury outcomes, investigate join keys or duplication")

con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
SELECT
  {star},
  o.Inj_Off_Next_w,
  o.Inj_Def_Next_w,
  o.Inj_Off_Last_w,
  o.Inj_Def_Last_w
FROM team_week_panel p
LEFT JOIN injury_outcomes_tmp o
USING (season, week, {TEAM_ABBR_COL})
""")

# Re-count on the rebuilt table as a final guard.
post_rows = int(con.execute("SELECT COUNT(*) AS n FROM team_week_panel").df()["n"].iloc[0])

if pre_rows != post_rows:
    raise RuntimeError("Row count changed after adding injury outcomes, investigate join keys or duplication")

Quick sanity check to confirm that injury outcome nulls only occur when week 'w+1' does not exist in the panel

In [None]:
# Recompute has_next_week on the panel and count outcome NULLs that should
# not exist. The team column is taken from TEAM_ABBR_COL (as in the cells that
# built the outcomes) rather than assuming it is named "team". COALESCE keeps
# the sums at 0 instead of NULL when the panel has no rows.
null_check = con.execute(f"""
WITH has_next AS (
  SELECT
    a.season,
    a.week,
    a.{TEAM_ABBR_COL} AS team_key,
    CASE WHEN b.week IS NULL THEN 0 ELSE 1 END AS has_next_week
  FROM team_week_panel a
  LEFT JOIN team_week_panel b
    ON b.season = a.season
   AND b.{TEAM_ABBR_COL} = a.{TEAM_ABBR_COL}
   AND b.week = a.week + 1
)
SELECT
  COALESCE(SUM(CASE WHEN h.has_next_week = 1 AND p.Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END), 0) AS bad_null_off,
  COALESCE(SUM(CASE WHEN h.has_next_week = 1 AND p.Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END), 0) AS bad_null_def,
  COALESCE(SUM(CASE WHEN p.Inj_Off_Last_w IS NULL THEN 1 ELSE 0 END), 0) AS null_last_off,
  COALESCE(SUM(CASE WHEN p.Inj_Def_Last_w IS NULL THEN 1 ELSE 0 END), 0) AS null_last_def
FROM team_week_panel p
JOIN has_next h
  ON h.season = p.season
 AND h.week = p.week
 AND h.team_key = p.{TEAM_ABBR_COL}
""").df()

print(null_check)

bad_off = int(null_check["bad_null_off"].iloc[0])
bad_def = int(null_check["bad_null_def"].iloc[0])
null_last_off = int(null_check["null_last_off"].iloc[0])
null_last_def = int(null_check["null_last_def"].iloc[0])

# Next-week outcomes may only be NULL when the panel has no week w+1 row.
if bad_off != 0 or bad_def != 0:
    raise RuntimeError("Inj_Off_Next_w or Inj_Def_Next_w has nulls even when next week exists")

# Lag controls are COALESCEd to 0 upstream, so any NULL here is unexpected.
if null_last_off != 0 or null_last_def != 0:
    raise RuntimeError("Inj_Off_Last_w or Inj_Def_Last_w contains nulls, lag controls should be defined as 0 when missing")

Quick sanity check to confirm that the output columns for new injury counts and health controls exist in the final dataframe and that their basic distributions follow the expected patterns for count-based football data

In [None]:
# Show the last 20 entries of the panel schema.
panel_schema = con.execute("DESCRIBE team_week_panel").df()
print(panel_schema.tail(20))

# Means of the four injury columns plus NULL counts of the two outcomes.
diag_sql = """
SELECT
  AVG(Inj_Off_Next_w) AS mean_inj_off_next,
  AVG(Inj_Def_Next_w) AS mean_inj_def_next,
  AVG(Inj_Off_Last_w) AS mean_inj_off_last,
  AVG(Inj_Def_Last_w) AS mean_inj_def_last,
  SUM(CASE WHEN Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END) AS null_off_next,
  SUM(CASE WHEN Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END) AS null_def_next
FROM team_week_panel
"""
diag = con.execute(diag_sql).df()

print(diag)

Quick sanity check to confirm that the number of next week outcome nulls matches the number of team weeks that have no week 'w+1' row in the panel

In [None]:
# Count team-weeks without a week w+1 row alongside the NULL counts of both
# next-week outcomes, so the three numbers can actually be compared. The team
# column comes from TEAM_ABBR_COL rather than a hard-coded "team", and
# COALESCE keeps the sums at 0 when the panel has no rows.
next_week_check = con.execute(f"""
WITH has_next AS (
  SELECT
    a.season,
    a.week,
    a.{TEAM_ABBR_COL} AS team_key,
    a.Inj_Off_Next_w,
    a.Inj_Def_Next_w,
    CASE WHEN b.week IS NULL THEN 0 ELSE 1 END AS has_next_week
  FROM team_week_panel a
  LEFT JOIN team_week_panel b
    ON b.season = a.season
   AND b.{TEAM_ABBR_COL} = a.{TEAM_ABBR_COL}
   AND b.week = a.week + 1
)
SELECT
  COALESCE(SUM(CASE WHEN has_next_week = 0 THEN 1 ELSE 0 END), 0) AS no_next_week_rows,
  COALESCE(SUM(CASE WHEN Inj_Off_Next_w IS NULL THEN 1 ELSE 0 END), 0) AS null_off_next,
  COALESCE(SUM(CASE WHEN Inj_Def_Next_w IS NULL THEN 1 ELSE 0 END), 0) AS null_def_next
FROM has_next
""").df()

print(next_week_check)

no_next_week_rows = int(next_week_check["no_next_week_rows"].iloc[0])
null_off_next = int(next_week_check["null_off_next"].iloc[0])
null_def_next = int(next_week_check["null_def_next"].iloc[0])

# Outcomes are NULL exactly for team-weeks with no following week, so the
# three counts must agree.
if null_off_next != no_next_week_rows or null_def_next != no_next_week_rows:
    raise RuntimeError(
        "Next week outcome null counts do not match the number of team weeks without a next week, "
        f"no_next_week_rows={no_next_week_rows}, null_off_next={null_off_next}, null_def_next={null_def_next}"
    )