In [2]:
import pandas as pd
import glob


# 📁 Load all raw Statcast data and combine seasons
files = sorted(glob.glob("../data/raw/statcast_*.csv"))
dfs = [pd.read_csv(file, low_memory=False) for file in files]
df_raw = pd.concat(dfs, ignore_index=True)

# 🎯 Add season and drop missing
df_raw['season'] = pd.to_datetime(df_raw['game_date']).dt.year
df = df_raw.dropna(subset=['release_speed', 'pitch_type', 'pitcher', 'player_name'])

# 🎯 Step 1: League avg velocity per pitch type
league_avg = df.groupby('pitch_type')['release_speed'].mean().reset_index()
league_avg.rename(columns={'release_speed': 'league_avg_speed'}, inplace=True)

# 🎯 Step 2: Pitcher avg velocity by pitch type
pitcher_vel = df.groupby(['pitcher', 'player_name', 'pitch_type'])['release_speed'].mean().reset_index()
pitcher_vel.rename(columns={'release_speed': 'avg_speed'}, inplace=True)

# 🔗 Step 3: Compare to league
merged = pitcher_vel.merge(league_avg, on='pitch_type')
merged['velocity_diff'] = merged['avg_speed'] - merged['league_avg_speed']

# 🎓 Grade function
def grade_pitch(diff):
    if diff > 2:
        return 'A'
    elif diff > 1:
        return 'B'
    elif diff >= -1:
        return 'C'
    elif diff >= -2:
        return 'D'
    else:
        return 'F'

merged['grade'] = merged['velocity_diff'].apply(grade_pitch)

# 🎯 Step 4: Top N pitchers per pitch type by velocity_diff
top_pitchers_by_type = (
    merged.sort_values(['pitch_type', 'velocity_diff'], ascending=[True, False])
    .groupby('pitch_type')
    .head(5)
    .reset_index(drop=True)
)

# 💾 Save
top_pitchers_by_type.to_csv("../data/processed/top_pitchers_by_pitch_type.csv", index=False)
print("✅ Saved: top_pitchers_by_pitch_type.csv")

# 🔍 Preview
top_pitchers_by_type.head(10)


✅ Saved: top_pitchers_by_pitch_type.csv


Unnamed: 0,pitcher,player_name,pitch_type,avg_speed,league_avg_speed,velocity_diff,grade
0,672841,"Vargas, Carlos",CH,93.9,85.438463,8.461537,A
1,676689,"Karcher, Ricky",CH,93.9,85.438463,8.461537,A
2,664126,"Fairbanks, Pete",CH,93.818,85.438463,8.379537,A
3,629498,"Quezada, Johan",CH,93.8,85.438463,8.361537,A
4,666619,"Santos, Gregory",CH,93.436364,85.438463,7.9979,A
5,571760,"Heaney, Andrew",CS,73.40597,64.846409,8.559561,A
6,543243,"Gray, Sonny",CS,73.26,64.846409,8.413591,A
7,608648,"Duffey, Tyler",CS,73.1875,64.846409,8.341091,A
8,607625,"Lugo, Seth",CS,71.671111,64.846409,6.824702,A
9,660644,"Bruján, Vidal",CS,68.5,64.846409,3.653591,A


In [9]:
# 📦 Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 📁 Load the data
df = pd.read_csv("../data/processed/top_pitchers_by_pitch_type.csv")

# 🎯 Pivot the data for heatmap
heatmap_df = df.pivot_table(
    index="pitch_type", 
    columns="player_name", 
    values="velocity_diff"
)
# 📈 Save the heatmap
plt.figure(figsize=(16, 8))
sns.heatmap(heatmap_df, annot=True, fmt=".1f", cmap="coolwarm", linewidths=0.5, cbar_kws={'label': 'Velocity Diff (mph)'})
plt.title(" Top Pitchers by Pitch Type (Velocity Difference vs League Avg)")
plt.xlabel("Pitcher")
plt.ylabel("Pitch Type")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# ✅ Correct save path
plt.savefig("F:/Portfolio/baseball-pitch-analysis/reports/figures/top_pitchers_by_pitch_type_heatmap.png", dpi=300)
plt.close()




In [13]:
import pandas as pd

# Load a file that includes both pitcher ID and player name
df = pd.read_csv("../data/processed/top_pitchers_by_stadium.csv")

# Extract and deduplicate
player_map = df[['pitcher', 'player_name']].drop_duplicates().sort_values('pitcher')

# Save it
player_map.to_csv("../data/processed/player_id_name.csv", index=False)
print("✅ Saved player ID-to-name map to player_id_name.csv")

# Load velocity grades and rankings with team info
grades_df = pd.read_csv("../data/processed/pitcher_velocity_grades.csv")
team_info = pd.read_csv("../data/processed/pitcher_rankings_by_team.csv")

# Just get pitcher and team info from team_info
pitcher_team_map = team_info[['pitcher', 'team']].drop_duplicates()

# Merge the team into grades
grades_with_team = grades_df.merge(pitcher_team_map, on="pitcher", how="left")

# Check for missing teams
missing_teams = grades_with_team['team'].isna().sum()
print(f"✅ Merged team info — missing teams for {missing_teams} pitchers")

# Save the result if needed
grades_with_team.to_csv("../data/processed/pitcher_velocity_grades_with_team.csv", index=False)


✅ Saved player ID-to-name map to player_id_name.csv
✅ Merged team info — missing teams for 5517 pitchers


In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Load all necessary data
grades = pd.read_csv("../data/processed/pitcher_velocity_grades_with_team.csv")
names = pd.read_csv("../data/processed/player_id_name.csv")
team_map = pd.read_csv("../data/processed/pitcher_rankings_by_team.csv")

# Filter to get Red Sox pitcher IDs for 2025
red_sox_pitchers = team_map[(team_map['team'] == 'BOS') & (team_map['season'] == 2025)]['pitcher'].unique()

# Filter grades to just those pitchers
team_data = grades[(grades['pitcher'].isin(red_sox_pitchers)) & (grades['season'] == 2025)]

# Merge to get player names
team_data = team_data.merge(names, on="pitcher", how="left")

# Pivot to create the heatmap matrix
agg_df = team_data.groupby(["pitch_type", "player_name"])["velocity_diff"].mean().reset_index()
heatmap_df = agg_df.pivot(index="pitch_type", columns="player_name", values="velocity_diff")


# Sort for aesthetics
heatmap_df = heatmap_df.sort_index().sort_index(axis=1)

# Create output directory
output_dir = "../reports/figures/team_heatmaps/"
os.makedirs(output_dir, exist_ok=True)

# Plot
# Define pitch type legend (customizable)
pitch_type_legend = {
    "CH": "Changeup",
    "CU": "Curveball",
    "EP": "Eephus",
    "FA": "Fastball",
    "FC": "Cutter",
    "FF": "Four-Seam Fastball",
    "FS": "Splitter",
    "KC": "Knuckle Curve",
    "PO": "Pitchout",
    "SI": "Sinker",
    "SL": "Slider",
    "ST": "Sweeper",
    "SV": "Slider Variant"
}

# Build legend text
legend_text = "\n".join([f"{k}: {v}" for k, v in pitch_type_legend.items()])

# Plot
plt.figure(figsize=(22, 10))
sns.heatmap(heatmap_df, annot=True, fmt=".1f", cmap="coolwarm", linewidths=0.5,
            cbar_kws={'label': 'Velocity Diff (mph)'})
plt.title(" Red Sox Pitchers by Pitch Type - 2025")
plt.xlabel("Pitcher")
plt.ylabel("Pitch Type")
plt.xticks(rotation=45, ha='right')

# Add pitch type legend as a text box
plt.gcf().text(1.02, 0.5, legend_text, fontsize=10, va='center', ha='left',
               bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5'))

plt.tight_layout()
plt.savefig("F:/Portfolio/baseball-pitch-analysis/reports/figures/BOS_2025_pitcher_heatmap_with_legend.png", dpi=300, bbox_inches='tight')
plt.close()

print("✅ Saved with pitch type legend.")



✅ Saved with pitch type legend.
