In [1]:
from pathlib import Path
import duckdb

DB_DIR = Path("../db")
DB_DIR.mkdir(parents=True, exist_ok=True)

con = duckdb.connect(str(DB_DIR / "nflpa.duckdb"))

print("db file", (DB_DIR / "nflpa.duckdb").resolve())

db file /Users/ramko/Desktop/2025-26-NFLPA-Data-Analytics-Case-Competition/db/nflpa.duckdb


In [2]:
con.execute("CREATE OR REPLACE VIEW pbp AS SELECT * FROM read_parquet('../data/raw/pbp.parquet')")
con.execute("CREATE OR REPLACE VIEW schedules AS SELECT * FROM read_parquet('../data/raw/schedules.parquet')")
con.execute("CREATE OR REPLACE VIEW injuries AS SELECT * FROM read_parquet('../data/raw/injuries.parquet')")
con.execute("CREATE OR REPLACE VIEW snap_counts AS SELECT * FROM read_parquet('../data/raw/snap_counts.parquet')")
con.execute("CREATE OR REPLACE VIEW players AS SELECT * FROM read_parquet('../data/raw/players.parquet')")
con.execute("CREATE OR REPLACE VIEW rosters_weekly AS SELECT * FROM read_parquet('../data/raw/rosters_weekly.parquet')")

print("views created")

views created


In [3]:
for name in ["pbp","schedules","injuries","snap_counts","players"]:
    n = con.execute(f"SELECT COUNT(*) FROM {name}").fetchone()[0]
    print(name, n)

pbp 627226
schedules 3544
injuries 70401
snap_counts 297999
players 24350


Quick sanity checks

In [4]:
con.execute("""
SELECT play_type, COUNT(*) AS n
FROM pbp
WHERE play_type IS NOT NULL
GROUP BY 1
ORDER BY n DESC
LIMIT 50
""").df()

Unnamed: 0,play_type,n
0,pass,262139
1,run,184080
2,no_play,58822
3,kickoff,36301
4,punt,30672
5,extra_point,16659
6,field_goal,13741
7,qb_kneel,5464
8,qb_spike,924


In [5]:
con.execute("DESCRIBE injuries").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,season,DOUBLE,YES,,,
1,game_type,VARCHAR,YES,,,
2,team,VARCHAR,YES,,,
3,week,DOUBLE,YES,,,
4,gsis_id,VARCHAR,YES,,,
5,position,VARCHAR,YES,,,
6,full_name,VARCHAR,YES,,,
7,first_name,VARCHAR,YES,,,
8,last_name,VARCHAR,YES,,,
9,report_primary_injury,VARCHAR,YES,,,


In [6]:
con.execute("""
SELECT season, week, team, COUNT(*) AS rows
FROM injuries
GROUP BY 1,2,3
ORDER BY season DESC, week DESC
LIMIT 20
""").df()

Unnamed: 0,season,week,team,rows
0,2024.0,22.0,KC,4
1,2024.0,22.0,PHI,11
2,2024.0,21.0,KC,4
3,2024.0,21.0,WAS,18
4,2024.0,21.0,PHI,11
5,2024.0,21.0,BUF,12
6,2024.0,20.0,LA,7
7,2024.0,20.0,DET,6
8,2024.0,20.0,BUF,12
9,2024.0,20.0,HOU,11


In [7]:
con.execute("""
SELECT season, week, game_id, home_team, away_team
FROM schedules
ORDER BY season DESC, week DESC
LIMIT 10
""").df()

Unnamed: 0,season,week,game_id,home_team,away_team
0,2024,22,2024_22_KC_PHI,PHI,KC
1,2024,21,2024_21_WAS_PHI,PHI,WAS
2,2024,21,2024_21_BUF_KC,KC,BUF
3,2024,20,2024_20_HOU_KC,KC,HOU
4,2024,20,2024_20_WAS_DET,DET,WAS
5,2024,20,2024_20_LA_PHI,PHI,LA
6,2024,20,2024_20_BAL_BUF,BUF,BAL
7,2024,19,2024_19_PIT_BAL,BAL,PIT
8,2024,19,2024_19_GB_PHI,PHI,GB
9,2024,19,2024_19_MIN_LA,LA,MIN


In [8]:
con.execute("DESCRIBE rosters_weekly").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,season,INTEGER,YES,,,
1,team,VARCHAR,YES,,,
2,position,VARCHAR,YES,,,
3,depth_chart_position,VARCHAR,YES,,,
4,jersey_number,VARCHAR,YES,,,
5,status,VARCHAR,YES,,,
6,full_name,VARCHAR,YES,,,
7,first_name,VARCHAR,YES,,,
8,last_name,VARCHAR,YES,,,
9,birth_date,DATE,YES,,,
