We import the Python dependencies and open a DuckDB connection that every later cell reuses

In [1]:
import duckdb
import pandas as pd
from pathlib import Path

# Locate the project root: the working directory or the nearest ancestor
# that contains a "db" entry.
cwd = Path.cwd()

root = None
for p in (cwd, *cwd.parents):
    if (p / "db").exists():
        root = p
        break
else:
    # Loop finished without a break, so no candidate had a "db" entry.
    raise FileNotFoundError("Could not find a db folder above the current working directory")

DB_PATH = root / "db" / "nflpa.duckdb"
print("Using DB_PATH", DB_PATH)

# Shared connection reused by every later cell.
con = duckdb.connect(str(DB_PATH))

# Session settings: worker thread count and memory ceiling.
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='4GB'")

Using DB_PATH /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


<_duckdb.DuckDBPyConnection at 0x11755e770>

We validate the prerequisites and also decide which team key column to use so that our joins in the upcoming modeling phase are perfectly aligned across different data sources

In [2]:
# Names of the tables currently present in the connected database.
tables = set(con.execute("SHOW TABLES").df()["name"].tolist())

if "team_week_panel" not in tables:
    raise RuntimeError("team_week_panel missing, run notebooks 01 through 05 first")

# Column metadata of the panel, kept both in table order and as a set for lookups.
panel_cols = con.execute("PRAGMA table_info('team_week_panel')").df()
panel_cols_list = panel_cols["name"].tolist()
panel_cols_set = set(panel_cols_list)

# Columns the feature-engineering cells below read from the panel.
required_cols = [
    "season",
    "week",
    "ST_Load_All_w",
    "ST_Load_ScoreLinked_w",
    "ST_Load_NonScore_w",
    "ST_Shock_All_w",
    "ST_Shock_ScoreLinked_w",
    "ST_Shock_NonScore_w",
]
missing = [c for c in required_cols if c not in panel_cols_set]
if missing:
    raise RuntimeError(f"Missing required columns in team_week_panel, {missing}")

# Prefer "team_id" and fall back to "team". Fail here with a clear message
# when neither exists, instead of letting a later query fail on an unknown
# column.
team_col_candidates = ("team_id", "team")
TEAM_COL = next((c for c in team_col_candidates if c in panel_cols_set), None)
if TEAM_COL is None:
    raise RuntimeError(
        f"team_week_panel has no team key column, expected one of {list(team_col_candidates)}"
    )
print("Using TEAM_COL", TEAM_COL)

def _existing_cols(table_name):
    """Return the set of column names that ``PRAGMA table_info`` reports for ``table_name``.

    The table name is interpolated directly into the SQL text, so it must be
    a trusted identifier.
    """
    table_info = con.execute(f"PRAGMA table_info('{table_name}')").df()
    return set(table_info["name"].tolist())

def _star_excluding(table_name, alias, cols_to_maybe_exclude):
    """Build an ``alias.*`` projection that omits columns already stored in a table.

    Only the names from ``cols_to_maybe_exclude`` that currently exist in
    ``table_name`` end up in the ``EXCLUDE`` list, in their original order.
    Existence is checked against ``table_name`` itself, not against whatever
    relation ``alias`` refers to in the surrounding query.

    Returns ``"<alias>.*"`` when nothing needs excluding, otherwise
    ``"<alias>.* EXCLUDE (col1, col2, ...)"``.
    """
    present = _existing_cols(table_name)
    droppable = [name for name in cols_to_maybe_exclude if name in present]
    if not droppable:
        return f"{alias}.*"
    return f"{alias}.* EXCLUDE ({', '.join(droppable)})"

Using TEAM_COL team


Quick sanity check to confirm that every season and week we started with is still present and that the table hasn't been accidentally filtered during the recent processing steps

In [3]:
# Per-season summary of the panel: row count, number of distinct teams,
# and the first and last week present.
season_coverage_sql = f"""
SELECT
  season,
  COUNT(*) AS rows,
  COUNT(DISTINCT {TEAM_COL}) AS teams,
  MIN(week) AS min_week,
  MAX(week) AS max_week
FROM team_week_panel
GROUP BY season
ORDER BY season
"""
con.execute(season_coverage_sql).df()

Unnamed: 0,season,rows,teams,min_week,max_week
0,2012,512,32,1,17
1,2013,512,32,1,17
2,2014,512,32,1,17
3,2015,512,32,1,17
4,2016,512,32,1,17
5,2017,512,32,1,17
6,2018,512,32,1,17
7,2019,512,32,1,17
8,2020,512,32,1,17
9,2021,544,32,1,18


Quick sanity check to find team seasons from 2021 onward that do not have 17 games recorded

In [4]:
# Team-seasons from 2021 onward whose number of panel rows differs from 17.
# The season restriction is a plain row filter, so it belongs in WHERE;
# HAVING is kept only for the aggregate condition.
con.execute(f"""
SELECT
  season,
  {TEAM_COL} AS team,
  COUNT(*) AS n_games
FROM team_week_panel
WHERE season >= 2021
GROUP BY season, {TEAM_COL}
HAVING COUNT(*) <> 17
ORDER BY season, team
""").df()

Unnamed: 0,season,team,n_games
0,2022,BUF,16
1,2022,CIN,16


Quick sanity check to find which weeks are missing for those teams

In [5]:
# Season to inspect. It is defined once so the three places that need it in
# the query below cannot drift apart.
check_season = 2022

# For every team whose row count in `check_season` differs from 17, list the
# weeks in 1..18 for which the panel has no row. range(1, 19) excludes its
# upper bound. Any week without a panel row is reported, whatever the reason.
con.execute(f"""
WITH team_counts AS (
  SELECT
    season,
    {TEAM_COL} AS team,
    COUNT(*) AS n_games
  FROM team_week_panel
  WHERE season = {check_season}
  GROUP BY season, {TEAM_COL}
  HAVING COUNT(*) <> 17
),
expected_weeks AS (
  SELECT {check_season} AS season, w AS week
  FROM range(1, 19) t(w)
),
team_expected AS (
  SELECT tc.team, ew.season, ew.week
  FROM team_counts tc
  CROSS JOIN expected_weeks ew
),
team_actual AS (
  SELECT season, week, {TEAM_COL} AS team
  FROM team_week_panel
  WHERE season = {check_season}
)
SELECT
  te.team,
  te.week AS missing_week
FROM team_expected te
LEFT JOIN team_actual ta
  ON te.season = ta.season
 AND te.week = ta.week
 AND te.team = ta.team
WHERE ta.team IS NULL
ORDER BY te.team, te.week
""").df()

Unnamed: 0,team,missing_week
0,BUF,7
1,BUF,17
2,CIN,10
3,CIN,17


Quick sanity check that lists the 2022 schedule rows involving BUF or CIN, to confirm that the schedules table is not restricted to regular season games

In [6]:
# Every 2022 `schedules` row in which BUF or CIN is the home or away team,
# ordered by week and then game_id.
buf_cin_schedule_sql = """
SELECT
  season,
  week,
  game_id,
  home_team,
  away_team,
  home_score,
  away_score
FROM schedules
WHERE season = 2022
  AND (home_team IN ('BUF','CIN') OR away_team IN ('BUF','CIN'))
ORDER BY week, game_id
"""
con.execute(buf_cin_schedule_sql).df()

Unnamed: 0,season,week,game_id,home_team,away_team,home_score,away_score
0,2022,1,2022_01_BUF_LA,LA,BUF,10,31
1,2022,1,2022_01_PIT_CIN,CIN,PIT,20,23
2,2022,2,2022_02_CIN_DAL,DAL,CIN,20,17
3,2022,2,2022_02_TEN_BUF,BUF,TEN,41,7
4,2022,3,2022_03_BUF_MIA,MIA,BUF,21,19
5,2022,3,2022_03_CIN_NYJ,NYJ,CIN,12,27
6,2022,4,2022_04_BUF_BAL,BAL,BUF,20,23
7,2022,4,2022_04_MIA_CIN,CIN,MIA,27,15
8,2022,5,2022_05_CIN_BAL,BAL,CIN,19,17
9,2022,5,2022_05_PIT_BUF,BUF,PIT,38,3


Quick sanity check to confirm whether a BUF versus CIN matchup row exists in your schedules table for season 2022

In [7]:
# 2022 `schedules` rows where BUF and CIN face each other, in either
# home/away arrangement.
buf_vs_cin_sql = """
SELECT
  season,
  week,
  game_id,
  home_team,
  away_team,
  home_score,
  away_score
FROM schedules
WHERE season = 2022
  AND (
    (home_team = 'BUF' AND away_team = 'CIN')
    OR
    (home_team = 'CIN' AND away_team = 'BUF')
  )
ORDER BY week, game_id
"""
con.execute(buf_vs_cin_sql).df()

Unnamed: 0,season,week,game_id,home_team,away_team,home_score,away_score
0,2022,20,2022_20_CIN_BUF,BUF,CIN,10,27


We compute season-to-date volatility measures for each special teams workload bucket and also ensure that these rolling statistics capture how much a team's special teams usage fluctuates as the season progresses

In [8]:
# Final feature columns this cell (re)creates.
cols_to_replace_optional = [
    "ST_Games_ToDate_w",
    "ST_Vol_All_w",
    "ST_Vol_ScoreLinked_w",
    "ST_Vol_NonScore_w",
]

# Intermediate columns that are computed inside the `base` CTE only and must
# never be persisted in team_week_panel.
helper_cols = [
    "_st_n_to_date",
    "_st_vol_all_raw",
    "_st_vol_scorelinked_raw",
    "_st_vol_nonscore_raw",
]

# Input projection: drop any copy of the output or helper columns that an
# earlier run left in the stored table, so `base` never carries two columns
# with the same name and the cell can be re-executed safely.
source_star = _star_excluding(
    "team_week_panel", "p", cols_to_replace_optional + helper_cols
)

# Output projection: the helpers always exist in `base` (they are defined in
# the CTE), so they are excluded unconditionally. Filtering this list by the
# stored table's columns would miss them on a first run and leak them into
# the rebuilt table.
star = f"base.* EXCLUDE ({', '.join(helper_cols)})"

# Season-to-date statistics per (season, team): the window covers every row
# from the team's first week of the season up to and including the current row.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {source_star},

    COUNT(*) OVER w AS _st_n_to_date,

    STDDEV_SAMP(ST_Load_All_w) OVER w AS _st_vol_all_raw,
    STDDEV_SAMP(ST_Load_ScoreLinked_w) OVER w AS _st_vol_scorelinked_raw,
    STDDEV_SAMP(ST_Load_NonScore_w) OVER w AS _st_vol_nonscore_raw

  FROM team_week_panel p
  WINDOW w AS (
    PARTITION BY season, {TEAM_COL}
    ORDER BY week
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
)
SELECT
  {star},

  _st_n_to_date AS ST_Games_ToDate_w,

  -- Volatility is reported as 0 until at least two rows are in the window,
  -- and NULL results are replaced by 0 as well.
  CASE
    WHEN _st_n_to_date < 2 THEN 0
    ELSE COALESCE(_st_vol_all_raw, 0)
  END AS ST_Vol_All_w,

  CASE
    WHEN _st_n_to_date < 2 THEN 0
    ELSE COALESCE(_st_vol_scorelinked_raw, 0)
  END AS ST_Vol_ScoreLinked_w,

  CASE
    WHEN _st_n_to_date < 2 THEN 0
    ELSE COALESCE(_st_vol_nonscore_raw, 0)
  END AS ST_Vol_NonScore_w

FROM base
""")

<_duckdb.DuckDBPyConnection at 0x11755e770>

Quick sanity check to confirm that our new volatility calculations didn't result in empty data points and also checking that even the early-season weeks have a default or starting volatility value assigned

In [9]:
# Count NULLs in the games-to-date column and in each volatility column.
# The query has no placeholders, so a plain string literal is used.
volatility_nulls_sql = """
SELECT
  SUM(CASE WHEN ST_Games_ToDate_w IS NULL THEN 1 ELSE 0 END) AS n_null_games_to_date,
  SUM(CASE WHEN ST_Vol_All_w IS NULL THEN 1 ELSE 0 END) AS n_null_vol_all,
  SUM(CASE WHEN ST_Vol_ScoreLinked_w IS NULL THEN 1 ELSE 0 END) AS n_null_vol_scorelinked,
  SUM(CASE WHEN ST_Vol_NonScore_w IS NULL THEN 1 ELSE 0 END) AS n_null_vol_nonscore
FROM team_week_panel
"""
con.execute(volatility_nulls_sql).df()

Unnamed: 0,n_null_games_to_date,n_null_vol_all,n_null_vol_scorelinked,n_null_vol_nonscore
0,0.0,0.0,0.0,0.0


Quick sanity check to confirm that the volatility starts at exactly zero for every team's first game of the season and also ensuring that our rolling standard deviation logic doesn't inherit values from the previous year

In [10]:
# Take each team-season's earliest-week row and count, per bucket, how many
# of those rows have volatility exactly 0. All four counts should agree.
first_game_vol_sql = f"""
WITH first_games AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    MIN(week) AS first_week
  FROM team_week_panel
  GROUP BY season, {TEAM_COL}
)
SELECT
  COUNT(*) AS first_game_rows,
  SUM(CASE WHEN p.ST_Vol_All_w = 0 THEN 1 ELSE 0 END) AS vol_all_zero_on_first,
  SUM(CASE WHEN p.ST_Vol_ScoreLinked_w = 0 THEN 1 ELSE 0 END) AS vol_scorelinked_zero_on_first,
  SUM(CASE WHEN p.ST_Vol_NonScore_w = 0 THEN 1 ELSE 0 END) AS vol_nonscore_zero_on_first
FROM team_week_panel p
JOIN first_games f
  ON p.season = f.season
 AND p.{TEAM_COL} = f.team_key
 AND p.week = f.first_week
"""
con.execute(first_game_vol_sql).df()

Unnamed: 0,first_game_rows,vol_all_zero_on_first,vol_scorelinked_zero_on_first,vol_nonscore_zero_on_first
0,416,416.0,416.0,416.0


We compute cumulative shock counts per bucket from week 1 through week w for each team season; the sanity checks that follow verify that the running total is never negative and never decreases as the season progresses

In [11]:
# Final feature columns this cell (re)creates.
cols_to_replace_optional = [
    "Cum_Shocks_All_w",
    "Cum_Shocks_ScoreLinked_w",
    "Cum_Shocks_NonScore_w",
]

# Intermediate columns that are computed inside the `base` CTE only and must
# never be persisted in team_week_panel.
helper_cols = [
    "_cum_all",
    "_cum_scorelinked",
    "_cum_nonscore",
]

# Input projection: drop any copy of the output or helper columns that an
# earlier run left in the stored table, so `base` never carries two columns
# with the same name and the cell can be re-executed safely.
source_star = _star_excluding(
    "team_week_panel", "p", cols_to_replace_optional + helper_cols
)

# Output projection: the helpers always exist in `base` (they are defined in
# the CTE), so they are excluded unconditionally. Filtering this list by the
# stored table's columns would miss them on a first run and leak them into
# the rebuilt table.
star = f"base.* EXCLUDE ({', '.join(helper_cols)})"

# Running shock totals per (season, team): the window covers every row from
# the team's first week of the season up to and including the current row.
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {source_star},

    SUM(ST_Shock_All_w) OVER w AS _cum_all,
    SUM(ST_Shock_ScoreLinked_w) OVER w AS _cum_scorelinked,
    SUM(ST_Shock_NonScore_w) OVER w AS _cum_nonscore

  FROM team_week_panel p
  WINDOW w AS (
    PARTITION BY season, {TEAM_COL}
    ORDER BY week
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  )
)
SELECT
  {star},

  -- A NULL running sum is stored as 0.
  COALESCE(_cum_all, 0) AS Cum_Shocks_All_w,
  COALESCE(_cum_scorelinked, 0) AS Cum_Shocks_ScoreLinked_w,
  COALESCE(_cum_nonscore, 0) AS Cum_Shocks_NonScore_w

FROM base
""")

<_duckdb.DuckDBPyConnection at 0x11755e770>

Quick sanity check to confirm that the total count of shocks never drops below zero and also ensuring that no data corruption or subtraction errors occurred during the aggregation process

In [12]:
# Count rows whose cumulative shock total is below zero, per bucket.
# The query has no placeholders, so a plain string literal is used.
negative_cum_shocks_sql = """
SELECT
  SUM(CASE WHEN Cum_Shocks_All_w < 0 THEN 1 ELSE 0 END) AS n_negative_all,
  SUM(CASE WHEN Cum_Shocks_ScoreLinked_w < 0 THEN 1 ELSE 0 END) AS n_negative_scorelinked,
  SUM(CASE WHEN Cum_Shocks_NonScore_w < 0 THEN 1 ELSE 0 END) AS n_negative_nonscore
FROM team_week_panel
"""
con.execute(negative_cum_shocks_sql).df()

Unnamed: 0,n_negative_all,n_negative_scorelinked,n_negative_nonscore
0,0.0,0.0,0.0


Quick sanity check to confirm that the count of shocks only ever stays the same or goes up as we move from week to week and also ensuring no "reset" logic is accidentally triggering in the middle of a season

In [13]:
# Compare each row's cumulative totals with the previous row of the same
# (season, team) in week order, and count the rows where a total went down.
decreasing_cum_shocks_sql = f"""
WITH chk AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    week,

    Cum_Shocks_All_w,
    LAG(Cum_Shocks_All_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_all,

    Cum_Shocks_ScoreLinked_w,
    LAG(Cum_Shocks_ScoreLinked_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_scorelinked,

    Cum_Shocks_NonScore_w,
    LAG(Cum_Shocks_NonScore_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week) AS prev_nonscore
  FROM team_week_panel
)
SELECT
  SUM(CASE WHEN prev_all IS NOT NULL AND Cum_Shocks_All_w < prev_all THEN 1 ELSE 0 END) AS n_decreasing_all,
  SUM(CASE WHEN prev_scorelinked IS NOT NULL AND Cum_Shocks_ScoreLinked_w < prev_scorelinked THEN 1 ELSE 0 END) AS n_decreasing_scorelinked,
  SUM(CASE WHEN prev_nonscore IS NOT NULL AND Cum_Shocks_NonScore_w < prev_nonscore THEN 1 ELSE 0 END) AS n_decreasing_nonscore
FROM chk
"""
con.execute(decreasing_cum_shocks_sql).df()

Unnamed: 0,n_decreasing_all,n_decreasing_scorelinked,n_decreasing_nonscore
0,0.0,0.0,0.0


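We create a three-step lookback of shock events: for each bucket, the shock flags from a team's one, two, and three preceding panel rows (ordered by week within the season), defaulting to 0 when no earlier row exists. The lags only ever look backward, so they never pull data from the future; note that a week with no row for a team is skipped rather than counted as a lag step. The checks that follow verify the lag columns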

In [14]:
# Final feature columns this cell (re)creates.
cols_to_replace_optional = [
    "ST_Shock_All_w_minus_1",
    "ST_Shock_All_w_minus_2",
    "ST_Shock_All_w_minus_3",
    "ST_Shock_ScoreLinked_w_minus_1",
    "ST_Shock_ScoreLinked_w_minus_2",
    "ST_Shock_ScoreLinked_w_minus_3",
    "ST_Shock_NonScore_w_minus_1",
    "ST_Shock_NonScore_w_minus_2",
    "ST_Shock_NonScore_w_minus_3",
]

# Intermediate columns that are computed inside the `base` CTE only and must
# never be persisted in team_week_panel.
helper_cols = [
    "_lag_all_1", "_lag_all_2", "_lag_all_3",
    "_lag_scorelinked_1", "_lag_scorelinked_2", "_lag_scorelinked_3",
    "_lag_nonscore_1", "_lag_nonscore_2", "_lag_nonscore_3",
]

# Input projection: drop any copy of the output or helper columns that an
# earlier run left in the stored table, so `base` never carries two columns
# with the same name and the cell can be re-executed safely.
source_star = _star_excluding(
    "team_week_panel", "p", cols_to_replace_optional + helper_cols
)

# Output projection: the helpers always exist in `base` (they are defined in
# the CTE), so they are excluded unconditionally. Filtering this list by the
# stored table's columns would miss them on a first run and leak them into
# the rebuilt table.
star = f"base.* EXCLUDE ({', '.join(helper_cols)})"

# Lags step back over the team's earlier rows of the same season in week
# order, so a lag of k means "k rows earlier", not "k calendar weeks earlier".
con.execute(f"""
CREATE OR REPLACE TABLE team_week_panel AS
WITH base AS (
  SELECT
    {source_star},

    LAG(ST_Shock_All_w, 1) OVER w AS _lag_all_1,
    LAG(ST_Shock_All_w, 2) OVER w AS _lag_all_2,
    LAG(ST_Shock_All_w, 3) OVER w AS _lag_all_3,

    LAG(ST_Shock_ScoreLinked_w, 1) OVER w AS _lag_scorelinked_1,
    LAG(ST_Shock_ScoreLinked_w, 2) OVER w AS _lag_scorelinked_2,
    LAG(ST_Shock_ScoreLinked_w, 3) OVER w AS _lag_scorelinked_3,

    LAG(ST_Shock_NonScore_w, 1) OVER w AS _lag_nonscore_1,
    LAG(ST_Shock_NonScore_w, 2) OVER w AS _lag_nonscore_2,
    LAG(ST_Shock_NonScore_w, 3) OVER w AS _lag_nonscore_3

  FROM team_week_panel p
  WINDOW w AS (
    PARTITION BY season, {TEAM_COL}
    ORDER BY week
  )
)
SELECT
  {star},

  -- A lag with no earlier row (or a NULL source value) is stored as 0.
  COALESCE(_lag_all_1, 0) AS ST_Shock_All_w_minus_1,
  COALESCE(_lag_all_2, 0) AS ST_Shock_All_w_minus_2,
  COALESCE(_lag_all_3, 0) AS ST_Shock_All_w_minus_3,

  COALESCE(_lag_scorelinked_1, 0) AS ST_Shock_ScoreLinked_w_minus_1,
  COALESCE(_lag_scorelinked_2, 0) AS ST_Shock_ScoreLinked_w_minus_2,
  COALESCE(_lag_scorelinked_3, 0) AS ST_Shock_ScoreLinked_w_minus_3,

  COALESCE(_lag_nonscore_1, 0) AS ST_Shock_NonScore_w_minus_1,
  COALESCE(_lag_nonscore_2, 0) AS ST_Shock_NonScore_w_minus_2,
  COALESCE(_lag_nonscore_3, 0) AS ST_Shock_NonScore_w_minus_3

FROM base
""")

<_duckdb.DuckDBPyConnection at 0x11755e770>

Quick sanity check to confirm that the values in the lag columns match the shock flags from one, two, and three weeks earlier, using the most recent season to spot-check the row-to-row movement

In [15]:
# Show the first 120 rows of the latest season, ordered by team and week,
# with each shock flag next to its three lag columns for visual inspection.
lag_spot_check_sql = f"""
SELECT
  season,
  week,
  {TEAM_COL} AS team_key,

  ST_Shock_NonScore_w,
  ST_Shock_NonScore_w_minus_1,
  ST_Shock_NonScore_w_minus_2,
  ST_Shock_NonScore_w_minus_3,

  ST_Shock_All_w,
  ST_Shock_All_w_minus_1,
  ST_Shock_All_w_minus_2,
  ST_Shock_All_w_minus_3,

  ST_Shock_ScoreLinked_w,
  ST_Shock_ScoreLinked_w_minus_1,
  ST_Shock_ScoreLinked_w_minus_2,
  ST_Shock_ScoreLinked_w_minus_3

FROM team_week_panel
WHERE season = (SELECT MAX(season) FROM team_week_panel)
ORDER BY team_key, week
LIMIT 120
"""
con.execute(lag_spot_check_sql).df()

Unnamed: 0,season,week,team_key,ST_Shock_NonScore_w,ST_Shock_NonScore_w_minus_1,ST_Shock_NonScore_w_minus_2,ST_Shock_NonScore_w_minus_3,ST_Shock_All_w,ST_Shock_All_w_minus_1,ST_Shock_All_w_minus_2,ST_Shock_All_w_minus_3,ST_Shock_ScoreLinked_w,ST_Shock_ScoreLinked_w_minus_1,ST_Shock_ScoreLinked_w_minus_2,ST_Shock_ScoreLinked_w_minus_3
0,2024,1,ARI,0,0,0,0,0,0,0,0,0,0,0,0
1,2024,2,ARI,0,0,0,0,0,0,0,0,0,0,0,0
2,2024,3,ARI,0,0,0,0,0,0,0,0,0,0,0,0
3,2024,4,ARI,0,0,0,0,0,0,0,0,0,0,0,0
4,2024,5,ARI,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2024,15,CIN,0,0,0,1,0,0,0,1,0,0,1,1
116,2024,16,CIN,0,0,0,0,0,0,0,0,0,0,0,1
117,2024,17,CIN,0,0,0,0,0,0,0,0,0,0,0,0
118,2024,18,CIN,0,0,0,0,0,0,0,0,0,0,0,0


Quick sanity check to confirm that the lag features saved in our table are identical to fresh calculations and also that the shifting logic remains consistent across every row in the dataset

In [16]:
# Recompute every lag from the current shock flags (same partition, order,
# and 0 default as the stored columns) and count the rows where the stored
# value differs from the fresh one.
lag_mismatch_sql = f"""
WITH chk AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    week,

    ST_Shock_All_w_minus_1,
    ST_Shock_All_w_minus_2,
    ST_Shock_All_w_minus_3,

    ST_Shock_ScoreLinked_w_minus_1,
    ST_Shock_ScoreLinked_w_minus_2,
    ST_Shock_ScoreLinked_w_minus_3,

    ST_Shock_NonScore_w_minus_1,
    ST_Shock_NonScore_w_minus_2,
    ST_Shock_NonScore_w_minus_3,

    COALESCE(LAG(ST_Shock_All_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS all_l1,
    COALESCE(LAG(ST_Shock_All_w, 2) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS all_l2,
    COALESCE(LAG(ST_Shock_All_w, 3) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS all_l3,

    COALESCE(LAG(ST_Shock_ScoreLinked_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS sl_l1,
    COALESCE(LAG(ST_Shock_ScoreLinked_w, 2) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS sl_l2,
    COALESCE(LAG(ST_Shock_ScoreLinked_w, 3) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS sl_l3,

    COALESCE(LAG(ST_Shock_NonScore_w, 1) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS ns_l1,
    COALESCE(LAG(ST_Shock_NonScore_w, 2) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS ns_l2,
    COALESCE(LAG(ST_Shock_NonScore_w, 3) OVER (PARTITION BY season, {TEAM_COL} ORDER BY week), 0) AS ns_l3

  FROM team_week_panel
)
SELECT
  SUM(CASE WHEN ST_Shock_All_w_minus_1 <> all_l1 THEN 1 ELSE 0 END) AS mismatch_all_l1,
  SUM(CASE WHEN ST_Shock_All_w_minus_2 <> all_l2 THEN 1 ELSE 0 END) AS mismatch_all_l2,
  SUM(CASE WHEN ST_Shock_All_w_minus_3 <> all_l3 THEN 1 ELSE 0 END) AS mismatch_all_l3,

  SUM(CASE WHEN ST_Shock_ScoreLinked_w_minus_1 <> sl_l1 THEN 1 ELSE 0 END) AS mismatch_sl_l1,
  SUM(CASE WHEN ST_Shock_ScoreLinked_w_minus_2 <> sl_l2 THEN 1 ELSE 0 END) AS mismatch_sl_l2,
  SUM(CASE WHEN ST_Shock_ScoreLinked_w_minus_3 <> sl_l3 THEN 1 ELSE 0 END) AS mismatch_sl_l3,

  SUM(CASE WHEN ST_Shock_NonScore_w_minus_1 <> ns_l1 THEN 1 ELSE 0 END) AS mismatch_ns_l1,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_2 <> ns_l2 THEN 1 ELSE 0 END) AS mismatch_ns_l2,
  SUM(CASE WHEN ST_Shock_NonScore_w_minus_3 <> ns_l3 THEN 1 ELSE 0 END) AS mismatch_ns_l3
FROM chk
"""
con.execute(lag_mismatch_sql).df()

Unnamed: 0,mismatch_all_l1,mismatch_all_l2,mismatch_all_l3,mismatch_sl_l1,mismatch_sl_l2,mismatch_sl_l3,mismatch_ns_l1,mismatch_ns_l2,mismatch_ns_l3
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Quick sanity check to confirm that every team has exactly one record for each week they played and also that our joins haven't accidentally doubled any rows in the final panel

In [17]:
# Key uniqueness check on (season, team, week): total rows, number of
# distinct keys, number of keys that occur more than once, and how many
# surplus rows those duplicated keys contribute.
duplicate_keys_sql = f"""
WITH keyed AS (
  SELECT
    season,
    {TEAM_COL} AS team_key,
    week,
    COUNT(*) AS n_rows
  FROM team_week_panel
  GROUP BY season, {TEAM_COL}, week
),
dups AS (
  SELECT *
  FROM keyed
  WHERE n_rows > 1
)
SELECT
  (SELECT COUNT(*) FROM team_week_panel) AS total_rows,
  (SELECT COUNT(*) FROM keyed) AS distinct_keys,
  (SELECT COUNT(*) FROM dups) AS n_duplicate_keys,
  (SELECT COALESCE(SUM(n_rows - 1), 0) FROM dups) AS n_extra_rows_from_dups
"""
con.execute(duplicate_keys_sql).df()

Unnamed: 0,total_rows,distinct_keys,n_duplicate_keys,n_extra_rows_from_dups
0,6782,6782,0,0.0
