We initialize Python imports and opens a DuckDB connection that every later cell reuses

In [1]:
import os
from pathlib import Path
import duckdb

print("cwd", Path().resolve())

candidates = []
search_roots = [
    Path("."),
    Path(".."),
    Path("./data"),
    Path("../data"),
    Path("../../data"),
]
for root in search_roots:
    if root.exists():
        candidates.extend(list(root.glob("*.duckdb")))
        candidates.extend(list(root.glob("**/*.duckdb")))

seen = set()
duckdb_files = []
for f in candidates:
    fp = str(f.resolve())
    if fp not in seen:
        seen.add(fp)
        duckdb_files.append(f.resolve())

print("duckdb files found")
for i, f in enumerate(duckdb_files[:25]):
    print(i, f)

db_file = None
for f in duckdb_files:
    if f.name == "nflpa.duckdb":
        db_file = f
        break

if db_file is None and duckdb_files:
    db_file = duckdb_files[0]

if db_file is None:
    raise RuntimeError("No duckdb file found near this notebook, rerun notebook 00 or check where you stored the database file")

con = duckdb.connect(str(db_file))
print("connected db", db_file)

tables = con.execute("SHOW TABLES").df()
tables

cwd /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/notebooks
duckdb files found
0 /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb
1 /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/data/nflpa.duckdb
connected db /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


Unnamed: 0,name
0,injuries
1,injuries_players_distinct
2,injuries_team_week_players
3,injury_counts_next
4,injury_counts_next_game
5,injury_counts_next_game_with_lags
6,injury_counts_with_lags
7,injury_outcomes_next_game_tmp
8,injury_outcomes_tmp
9,injury_players_next_game


Quick sanity check to confirm that 'team_week_panel' exists in the connected database 

In [2]:
con.execute("""
SELECT
  SUM(CASE WHEN name = 'team_week_panel' THEN 1 ELSE 0 END) AS has_team_week_panel
FROM (SHOW TABLES)
""").df()

Unnamed: 0,has_team_week_panel
0,1.0


Quick sanity check to confirm that 'team_week_panel' has no duplicate season-week-team rows and that the key count equals the row count to ensure that the final modeling table is a perfectly unique panel

In [3]:
con.execute("""
SELECT
  COUNT(*) AS rows,
  COUNT(DISTINCT season || '-' || week || '-' || team) AS distinct_keys
FROM team_week_panel
""").df()

dups = con.execute("""
SELECT
  season,
  week,
  team,
  COUNT(*) AS n
FROM team_week_panel
GROUP BY 1,2,3
HAVING COUNT(*) > 1
ORDER BY n DESC, season, week, team
LIMIT 50
""").df()

dups

Unnamed: 0,season,week,team,n
