In [1]:
# 📦 Imports
import pandas as pd
import glob

# 🧰 Helpers
def load_raw_statcast(pattern):
    """Read every CSV matching *pattern* and stack them into one DataFrame.

    Files are read in sorted path order and the result gets a fresh
    0..n-1 index.

    Raises:
        FileNotFoundError: if no file matches *pattern* (otherwise pandas
            fails later with an unhelpful "No objects to concatenate").
    """
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(f"No Statcast CSV files match {pattern!r}")
    frames = (pd.read_csv(path, low_memory=False) for path in paths)
    return pd.concat(frames, ignore_index=True)


def assign_pitching_team(pitches):
    """Return a copy of *pitches* with a ``team`` column for the pitcher's club.

    The home team is in the field during the top half of an inning and the
    away team during the bottom half, so the pitcher's team follows from the
    half-inning marker. Rows whose marker is neither ``'Top'`` nor ``'Bot'``
    get a missing ``team`` so they can be dropped downstream instead of being
    silently mislabeled.

    NOTE(review): assumes the standard Statcast columns ``inning_topbot``
    (values 'Top'/'Bot') and ``away_team`` are present in the raw files --
    confirm against the downloaded CSVs.

    Raises:
        KeyError: if ``home_team``, ``away_team`` or ``inning_topbot`` is
            missing from *pitches*.
    """
    needed = ['home_team', 'away_team', 'inning_topbot']
    missing = [col for col in needed if col not in pitches.columns]
    if missing:
        raise KeyError(f"Cannot derive pitching team; missing columns: {missing}")

    # Work on a copy so the caller's frame is untouched and pandas does not
    # emit chained-assignment warnings when the input is a filtered view.
    out = pitches.copy()
    home_pitching = out['inning_topbot'].eq('Top')
    away_pitching = out['inning_topbot'].eq('Bot')
    # Inner where(): away team for bottom halves, missing otherwise.
    # Outer where(): home team for top halves, else the inner result.
    out['team'] = out['home_team'].where(
        home_pitching, out['away_team'].where(away_pitching)
    )
    return out


def rank_pitchers_within_team(grade_a_rows):
    """Count rows per (season, team, pitcher, player_name).

    Returns a DataFrame with a ``grade_a_count`` column, ordered by season
    and team ascending, then by count descending (best pitcher first
    within each team).
    """
    counts = (
        grade_a_rows.groupby(['season', 'team', 'pitcher', 'player_name'])
        .size()
        .reset_index(name='grade_a_count')
    )
    return counts.sort_values(
        ['season', 'team', 'grade_a_count'], ascending=[True, True, False]
    )


def rank_team_staffs(pitcher_ranks):
    """Sum ``grade_a_count`` per (season, team) into ``team_total_grade_a``.

    The result is ordered by season ascending, then by team total
    descending (strongest staff first within each season).
    """
    totals = (
        pitcher_ranks.groupby(['season', 'team'])['grade_a_count']
        .sum()
        .reset_index(name='team_total_grade_a')
    )
    return totals.sort_values(
        ['season', 'team_total_grade_a'], ascending=[True, False]
    )


# The guard is true both in a notebook cell and when run as a script, so the
# pipeline still executes there; the names below remain module-level globals.
if __name__ == "__main__":
    # 📁 Load raw Statcast data
    df_raw = load_raw_statcast("../data/raw/statcast_*.csv")

    # ✅ Basic cleanup + setup
    df_raw['season'] = pd.to_datetime(df_raw['game_date']).dt.year
    df_raw = df_raw.dropna(subset=['pitcher', 'player_name', 'home_team', 'pitch_type', 'release_speed'])
    # Pitcher's team comes from the half-inning, not from player_name/inning.
    df_raw = assign_pitching_team(df_raw)

    # 📁 Load velocity grades
    grades = pd.read_csv("../data/processed/pitcher_velocity_grades.csv")

    # 🎯 Keep only Grade A rows *before* merging: same result as filtering
    # afterwards, but the join against pitch-level data stays much smaller.
    grade_a = grades[grades['velocity_grade'] == 'A']

    # 🔗 Merge grades with full raw data.
    # NOTE(review): df_raw is not de-duplicated, so each grade row is repeated
    # once per matching raw row and grade_a_count below counts those merged
    # rows -- confirm this is the intended unit.
    df = grade_a.merge(
        df_raw[['season', 'pitcher', 'player_name', 'team', 'pitch_type']],
        on=['season', 'pitcher', 'pitch_type'],
        how='left',
    ).dropna(subset=['player_name', 'team'])

    # 📊 Part 1: Pitcher rankings within team
    pitcher_team_rank = rank_pitchers_within_team(df)

    # 📊 Part 2: Total Grade A count per team (staff strength)
    team_staff_rank = rank_team_staffs(pitcher_team_rank)

    # 💾 Save both
    pitcher_team_rank.to_csv("../data/processed/pitcher_rankings_by_team.csv", index=False)
    team_staff_rank.to_csv("../data/processed/team_staff_strength.csv", index=False)

    print("✅ Saved pitcher and team pitching rankings.")


✅ Saved pitcher and team pitching rankings.
