In [3]:
# 📦 Imports
import pandas as pd
import glob

# 📁 Load raw Statcast files again
files = sorted(glob.glob("../data/raw/statcast_*.csv"))
dfs = [pd.read_csv(file, low_memory=False) for file in files]
df = pd.concat(dfs, ignore_index=True)

# 📁 Load velocity grades
grades = pd.read_csv("../data/processed/pitcher_velocity_grades.csv")

# 🧼 Drop rows with missing pitcher/team/inning
df = df.dropna(subset=['pitcher', 'inning'])

# ✅ Identify game ID
df['game_id'] = df['game_date'].astype(str) + "_" + df['home_team'] + "_" + df['away_team']

# ✅ Sort and find first pitch per game for each team
df = df.sort_values(by=['game_id', 'inning', 'pitch_number'])
first_pitchers = (
    df.groupby(['game_id'])
    .first()
    .reset_index()[['game_id', 'pitcher']]
)

# 🧮 Count how many times each pitcher opened a game (i.e., starter)
starter_counts = first_pitchers['pitcher'].value_counts().reset_index()
starter_counts.columns = ['pitcher', 'starts']

# 🔢 Count total appearances per pitcher
total_counts = df['pitcher'].value_counts().reset_index()
total_counts.columns = ['pitcher', 'total']

# 🧠 Determine roles
roles = pd.merge(total_counts, starter_counts, on='pitcher', how='left').fillna(0)
roles['starts'] = roles['starts'].astype(int)
roles['role'] = roles.apply(lambda row: 'Starter' if row['starts'] >= 5 else 'Reliever', axis=1)

# 🧾 Assign team
pitcher_teams = df.groupby('pitcher')['home_team'].first().reset_index()
pitcher_teams.columns = ['pitcher', 'team']

# 🔗 Merge everything into velocity grades
grades = grades.merge(pitcher_teams, on='pitcher', how='left')
grades = grades.merge(roles[['pitcher', 'role']], on='pitcher', how='left')

# 📊 Team-level summary
team_summary = grades.groupby(['team', 'role', 'velocity_grade']).size().unstack(fill_value=0)

# 💾 Save output
team_summary.to_csv("../data/processed/team_pitching_summary.csv")
print("✅ Saved team summary to ../data/processed/team_pitching_summary.csv")

# 🔍 Preview
team_summary.head()


✅ Saved team summary to ../data/processed/team_pitching_summary.csv


Unnamed: 0_level_0,velocity_grade,A,B,C,D,F
team,role,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATH,Reliever,5,3,7,3,0
ATL,Reliever,31,21,53,19,46
ATL,Starter,22,7,29,12,14
AZ,Reliever,40,23,64,27,61
AZ,Starter,7,10,25,3,25


In [4]:
print(grades['role'].value_counts())


role
Reliever    4827
Starter     2123
Name: count, dtype: int64
