In [1]:
# 📦 Imports
import glob
from pathlib import Path

import pandas as pd

# 📁 Load all data files
# The pattern is relative, so it resolves against the current working directory.
raw_pattern = "../data/raw/statcast_*.csv"
files = sorted(glob.glob(raw_pattern))

# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list, so fail early with a message that names the pattern searched.
if not files:
    raise FileNotFoundError(f"No input files match {raw_pattern!r}")

# low_memory=False parses each file whole instead of in chunks, which avoids
# mixed-type column inference.
dfs = [pd.read_csv(file, low_memory=False) for file in files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

# 🧼 Clean up: keep only rows that have a pitch type and both player IDs
required_columns = ['pitch_type', 'pitcher', 'batter']
df = df.dropna(subset=required_columns)

# 📊 Basic ordering: pitch_number is the last sort key, so rows that share
# all the preceding keys end up in pitch order
sort_columns = ['game_date', 'pitcher', 'batter', 'inning', 'at_bat_number', 'pitch_number']
df = df.sort_values(by=sort_columns)

# 🧠 Add unique at-bat identifier of the form
# "<game_date>_<pitcher>_<batter>_<inning>_<at_bat_number>"
# NOTE(review): the key contains no game identifier; if two games on the same
# date can share pitcher, batter, inning and at_bat_number, their pitches
# would land in the same at-bat — confirm against the data.
id_columns = ['game_date', 'pitcher', 'batter', 'inning', 'at_bat_number']
at_bat_key = df[id_columns[0]].astype(str)
for id_column in id_columns[1:]:
    at_bat_key = at_bat_key + "_" + df[id_column].astype(str)
df['at_bat_id'] = at_bat_key

# 📈 Collapse each at-bat into one row in a single groupby pass:
#   pitch_sequence      -> list of pitch_type values in the frame's current row order
#   pitcher/batter/date -> first value seen in the group
# The resulting columns are at_bat_id, pitch_sequence, pitcher, batter,
# game_date, with one row per at_bat_id.
sequences = (
    df.groupby('at_bat_id')
    .agg(
        pitch_sequence=('pitch_type', list),
        pitcher=('pitcher', 'first'),
        batter=('batter', 'first'),
        game_date=('game_date', 'first'),
    )
    .reset_index()
)

# 🪪 Pitcher, batter and date per at-bat. Kept as a separate frame under
# this name in case other cells rely on it.
meta = sequences[['at_bat_id', 'pitcher', 'batter', 'game_date']].copy()

# 💾 Save output
# The path is relative, so it resolves against the current working directory.
output_path = "../data/processed/pitch_sequences.csv"

# to_csv does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE: pitch_sequence holds Python lists; CSV stores each one as its string
# representation, so readers of this file get text back, not list objects.
sequences.to_csv(output_path, index=False)
print(f"✅ Saved to {output_path}")

# 🔍 Preview (last expression of the cell, so the notebook renders it)
sequences.head()


✅ Saved to ../data/processed/pitch_sequences.csv


Unnamed: 0,at_bat_id,pitch_sequence,pitcher,batter,game_date
0,2022-04-07_424144_543333_6_42,"[SI, SI, SL]",424144,543333,2022-04-07
1,2022-04-07_424144_595777_6_43,"[SI, SI, FF, SI, SI]",424144,595777,2022-04-07
2,2022-04-07_424144_663757_7_49,"[SL, SL, SL, SL, SL]",424144,663757,2022-04-07
3,2022-04-07_424144_673490_6_44,"[SL, SI, SL, SI]",424144,673490,2022-04-07
4,2022-04-07_425794_547379_3_21,"[SI, SI, CU, SI, SI]",425794,547379,2022-04-07
