In [6]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pybaseball
from pybaseball import statcast
pybaseball.cache.enable()

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [17]:
%%time
# .iloc[::-1].reset_index(drop=True): reverse the row order because the default of row 1 is the most recent instance
df = statcast(start_dt='2023-03-30',end_dt='2023-10-01').iloc[::-1].reset_index(drop=True)
df.shape

This is a large query, it may take a moment to complete


100%|██████████| 186/186 [00:04<00:00, 38.92it/s]


CPU times: user 14.4 s, sys: 6.35 s, total: 20.8 s
Wall time: 15.8 s


(717945, 113)

In [22]:
# check 
check_n_home_game = df.groupby(['home_team'])['game_pk'].nunique()
print(check_n_home_game.unique())
print(f'number of teams in the query: {len(check_n_home_game)}')

[81]
number of teams in the query: 30


In [30]:
# check row order 
print(df[['game_pk','game_date','inning','inning_topbot']].head())
print(df[['game_pk','game_date','inning','inning_topbot']].tail())

   game_pk  game_date  inning inning_topbot
0   718767 2023-03-30       1           Top
1   718767 2023-03-30       1           Top
2   718767 2023-03-30       1           Top
3   718767 2023-03-30       1           Top
4   718767 2023-03-30       1           Top
        game_pk  game_date  inning inning_topbot
717940   716367 2023-10-01       9           Bot
717941   716367 2023-10-01       9           Bot
717942   716367 2023-10-01       9           Bot
717943   716367 2023-10-01       9           Bot
717944   716367 2023-10-01       9           Bot


In [45]:
# list out what kind of data we need for each game
cols = ['home_Win','game_pk','date','away_team','home_team','away_starting_pitcher','home_starting_pitcher']

# add batters 1-9 for both away and home
for i in ['away','home']:
    for j in range(1,10):
        cols.append(f'{i}_b{j}')

# colnames for pitcher metrics: 'sp_era','sp_k9','sp_bb','bp_era','bp_k9','bp_bb'
# sp: starting pitcher; bp: bullpen; bb: pitcher's walk rate
# pitcher metrics will be added later
# batter metrics will be added later as well

print(cols[:10])
print(cols[10:20])
print(cols[20:])

['home_Win', 'game_pk', 'date', 'away_team', 'home_team', 'away_starting_pitcher', 'home_starting_pitcher', 'away_b1', 'away_b2', 'away_b3']
['away_b4', 'away_b5', 'away_b6', 'away_b7', 'away_b8', 'away_b9', 'home_b1', 'home_b2', 'home_b3', 'home_b4']
['home_b5', 'home_b6', 'home_b7', 'home_b8', 'home_b9']


In [54]:
data_without_batters = df.groupby('game_pk').apply(lambda group: pd.Series({
    'date': group['game_date'].iloc[0],  # only one corresponding value
    'away_team': group['away_team'].iloc[0],  # only one corresponding value
    'home_team': group['home_team'].iloc[0],  # only one corresponding value
    'away_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Bot'), 'pitcher'].iloc[0],
    'home_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Top'), 'pitcher'].iloc[0],

})).sort_values(by='date', ascending=True).reset_index()
data_without_batters.head()

  data_without_batters = df.groupby('game_pk').apply(lambda group: pd.Series({


Unnamed: 0,game_pk,date,away_team,home_team,away_starting_pitcher,home_starting_pitcher
0,718782,2023-03-30,BAL,BOS,502043,446372
1,718781,2023-03-30,SF,NYY,657277,543037
2,718767,2023-03-30,CLE,SEA,669456,622491
3,718768,2023-03-30,CWS,HOU,656302,664285
4,718770,2023-03-30,AZ,LAD,668678,628711


In [47]:
def get_away_batting_order(group):
    away_batters = group.loc[group['inning_topbot'] == 'Top', 'batter'].unique()[:9]
    away_b_dict = {f'away_b{i+1}':away_batters[i] for i in range(9)}
    return pd.Series(away_b_dict)

def get_home_batting_order(group):
    home_batters = group.loc[group['inning_topbot'] == 'Bot', 'batter'].unique()[:9]
    home_b_dict = {f'home_b{i+1}':home_batters[i] for i in range(9)}
    return pd.Series(home_b_dict)


In [52]:
away_bs = df.groupby('game_pk').apply(get_away_batting_order)
home_bs = df.groupby('game_pk').apply(get_home_batting_order)

  away_bs = df.groupby('game_pk').apply(get_away_batting_order)
  home_bs = df.groupby('game_pk').apply(get_home_batting_order)


In [56]:
data_with_batters = data_without_batters.merge(away_bs, on='game_pk').merge(home_bs, on='game_pk')
data_with_batters.head()
print(data_with_batters.shape)

(2430, 24)


In [57]:
data_with_batters.to_csv('data_without_metric.csv', index=True)