In [2]:
# 📦 Imports
import glob
from pathlib import Path

import pandas as pd

# 📁 Load all raw data files again
files = sorted(glob.glob("../data/raw/statcast_*.csv"))
if not files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(
        "No raw files matched ../data/raw/statcast_*.csv; "
        "check the working directory and that the raw data exists."
    )
dfs = [pd.read_csv(file, low_memory=False) for file in files]
df = pd.concat(dfs, ignore_index=True)

# 🧼 Drop rows missing any field the groupings below depend on
#    (velocity, pitch type, or pitcher id)
df = df.dropna(subset=['release_speed', 'pitch_type', 'pitcher'])

# ✅ Add season column (calendar year taken from game_date)
df['season'] = pd.to_datetime(df['game_date']).dt.year

# 🎯 Step 1: Average velocity per pitcher, pitch type, and season
pitcher_vel = (
    df.groupby(['season', 'pitcher', 'pitch_type'])['release_speed']
    .mean()
    .reset_index(name='avg_speed')
)

# 🎯 Step 2: League average velocity per pitch type per season
# NOTE: this is the mean over every row of df in the group, not the mean of
# the per-pitcher averages, so pitchers with more rows carry more weight.
league_avg = (
    df.groupby(['season', 'pitch_type'])['release_speed']
    .mean()
    .reset_index(name='league_avg_speed')
)

# 🔗 Merge to compare (default inner join on season + pitch type)
merged = pitcher_vel.merge(league_avg, on=['season', 'pitch_type'])
# Positive values mean the pitcher's average is above the league average.
merged['velocity_diff'] = merged['avg_speed'] - merged['league_avg_speed']

# 🧮 Step 3: Assign letter grades based on velocity difference
def grade_pitch(diff):
    """Map a velocity difference to a letter grade.

    Bands, checked from best to worst (first match wins):
        diff > 2          -> 'A'
        1 < diff <= 2     -> 'B'
        -1 <= diff <= 1   -> 'C'
        -2 <= diff < -1   -> 'D'
        anything else     -> 'F'

    A NaN input fails every comparison and therefore also yields 'F'.
    """
    # Each entry pairs a grade with whether `diff` clears that band's floor.
    bands = (
        ('A', diff > 2),
        ('B', diff > 1),
        ('C', diff >= -1),
        ('D', diff >= -2),
    )
    return next((letter for letter, cleared in bands if cleared), 'F')

# Grade each (season, pitcher, pitch type) row from its gap to the league mean.
merged['velocity_grade'] = merged['velocity_diff'].apply(grade_pitch)

# 💾 Save to processed
# to_csv does not create missing parent folders, so make sure the output
# directory exists first (e.g. on a fresh checkout).
Path("../data/processed").mkdir(parents=True, exist_ok=True)
merged.to_csv("../data/processed/pitcher_velocity_grades.csv", index=False)
print("✅ Saved to ../data/processed/pitcher_velocity_grades.csv (with season)")

# 🔍 Preview
merged.head()


✅ Saved to ../data/processed/pitcher_velocity_grades.csv (with season)


Unnamed: 0,season,pitcher,pitch_type,avg_speed,league_avg_speed,velocity_diff,velocity_grade
0,2022,405395,CU,53.325,78.699901,-25.374901,F
1,2022,405395,FA,62.373684,68.691144,-6.31746,F
2,2022,424144,FF,88.75,93.922772,-5.172772,F
3,2022,424144,SI,88.345455,93.328176,-4.982721,F
4,2022,424144,SL,76.581481,85.105727,-8.524246,F
