In [6]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pybaseball
from pybaseball import statcast
pybaseball.cache.enable()

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [137]:
%%time
# .iloc[::-1].reset_index(drop=True): reverse the row order because the default of row 1 is the most recent instance
# .reset_index().sort_index(ascending=False).reset_index(drop=True) should do the same thing. if not then change it back to the one above
df = statcast(start_dt='2023-03-30',end_dt='2023-10-01').reset_index().sort_index(ascending=False).reset_index(drop=True)
df.shape

This is a large query, it may take a moment to complete


100%|██████████| 186/186 [00:04<00:00, 39.08it/s]


CPU times: user 15.1 s, sys: 7.74 s, total: 22.9 s
Wall time: 18.9 s


(717945, 114)

In [138]:
# check 
check_n_home_game = df.groupby(['home_team'])['game_pk'].nunique()
print(check_n_home_game.unique())
print(f'number of teams in the query: {len(check_n_home_game)}')

[81]
number of teams in the query: 30


In [139]:
# check row order 
print(df[['game_pk','game_date','inning','inning_topbot']].head())
print(df[['game_pk','game_date','inning','inning_topbot']].tail())

   game_pk  game_date  inning inning_topbot
0   718767 2023-03-30       1           Top
1   718767 2023-03-30       1           Top
2   718767 2023-03-30       1           Top
3   718767 2023-03-30       1           Top
4   718767 2023-03-30       1           Top
        game_pk  game_date  inning inning_topbot
717940   716367 2023-10-01       9           Bot
717941   716367 2023-10-01       9           Bot
717942   716367 2023-10-01       9           Bot
717943   716367 2023-10-01       9           Bot
717944   716367 2023-10-01       9           Bot


In [140]:
# list out what kind of data we need for each game
cols = ['home_result','game_pk','date','away_team','home_team','post_away_score','post_home_score','away_starting_pitcher','home_starting_pitcher']
# post away/home score instead of away/home score: considering a case like a walk off hr, it will reflect on the post score instead of the score (which is the score before the hit)

# add batters 1-9 for both away and home
for i in ['away','home']:
    for j in range(1,10):
        cols.append(f'{i}_b{j}')

# colnames for pitcher metrics: 'sp_era','sp_k9','sp_bb','bp_era','bp_k9','bp_bb'
# sp: starting pitcher; bp: bullpen; bb: pitcher's walk rate
# pitcher metrics will be added later
# batter metrics will be added later as well

print(cols[:10])
print(cols[10:20])
print(cols[20:])

['home_result', 'game_pk', 'date', 'away_team', 'home_team', 'post_away_score', 'post_home_score', 'away_starting_pitcher', 'home_starting_pitcher', 'away_b1']
['away_b2', 'away_b3', 'away_b4', 'away_b5', 'away_b6', 'away_b7', 'away_b8', 'away_b9', 'home_b1', 'home_b2']
['home_b3', 'home_b4', 'home_b5', 'home_b6', 'home_b7', 'home_b8', 'home_b9']


In [141]:
data_without_batters = df.groupby('game_pk',sort=False).apply(lambda group: pd.Series({
    'home_result': None,
    'date': group['game_date'].iloc[0],  # only one corresponding value
    'away_team': group['away_team'].iloc[0],  # only one corresponding value
    'home_team': group['home_team'].iloc[0],  # only one corresponding value
    'away_final_score': group['post_away_score'].iloc[-1],
    'home_final_score': group['post_home_score'].iloc[-1],
    'away_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Bot'), 'pitcher'].iloc[0],
    'home_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Top'), 'pitcher'].iloc[0],
})).sort_values(by='date', ascending=True).reset_index()
data_without_batters.head()

  data_without_batters = df.groupby('game_pk',sort=False).apply(lambda group: pd.Series({


Unnamed: 0,game_pk,home_result,date,away_team,home_team,away_final_score,home_final_score,away_starting_pitcher,home_starting_pitcher
0,718767,,2023-03-30,CLE,SEA,0,3,669456,622491
1,718782,,2023-03-30,BAL,BOS,10,9,502043,446372
2,718780,,2023-03-30,ATL,WSH,7,2,608331,571578
3,718779,,2023-03-30,PHI,TEX,7,11,605400,594798
4,718778,,2023-03-30,COL,SD,7,2,608566,605483


In [142]:
data_without_batters['home_result'] = np.where(data_without_batters['home_final_score'] > data_without_batters['away_final_score'], 'W', 'L')
data_without_batters.head()

Unnamed: 0,game_pk,home_result,date,away_team,home_team,away_final_score,home_final_score,away_starting_pitcher,home_starting_pitcher
0,718767,W,2023-03-30,CLE,SEA,0,3,669456,622491
1,718782,L,2023-03-30,BAL,BOS,10,9,502043,446372
2,718780,L,2023-03-30,ATL,WSH,7,2,608331,571578
3,718779,W,2023-03-30,PHI,TEX,7,11,605400,594798
4,718778,L,2023-03-30,COL,SD,7,2,608566,605483


In [143]:
def get_away_batting_order(group):
    away_batters = group.loc[group['inning_topbot'] == 'Top', 'batter'].unique()[:9]
    away_b_dict = {f'away_b{i+1}':away_batters[i] for i in range(9)}
    return pd.Series(away_b_dict)

def get_home_batting_order(group):
    home_batters = group.loc[group['inning_topbot'] == 'Bot', 'batter'].unique()[:9]
    home_b_dict = {f'home_b{i+1}':home_batters[i] for i in range(9)}
    return pd.Series(home_b_dict)


In [145]:
away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)

  away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
  home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)


In [146]:
data_with_batters = data_without_batters.merge(away_bs, on='game_pk').merge(home_bs, on='game_pk')
print(data_with_batters.shape)
data_with_batters.head()

(2430, 27)


Unnamed: 0,game_pk,home_result,date,away_team,home_team,away_final_score,home_final_score,away_starting_pitcher,home_starting_pitcher,away_b1,...,away_b9,home_b1,home_b2,home_b3,home_b4,home_b5,home_b6,home_b7,home_b8,home_b9
0,718767,W,2023-03-30,CLE,SEA,0,3,669456,622491,680757,...,664702,677594,543939,664034,606192,663728,553993,672284,600303,641487
1,718782,L,2023-03-30,BAL,BOS,10,9,502043,446372,656775,...,622761,657077,646240,457759,807799,594807,671213,624414,624512,571771
2,718780,L,2023-03-30,ATL,WSH,7,2,608331,571578,660670,...,606115,657041,608841,600869,642086,660688,669743,671277,645302,682928
3,718779,W,2023-03-30,PHI,TEX,7,11,605400,594798,607208,...,669016,543760,608369,663993,666969,673962,641680,543257,543543,669701
4,718778,L,2023-03-30,COL,SD,7,2,608566,605483,602074,...,678662,663757,665742,592518,593428,630105,572761,543592,673490,621311


In [147]:
data_with_batters.to_csv('data_without_metric.csv', index=True)

---
# METRICS FOR BATTING AND PITCHING

In [148]:
# calculate pitching metric
def calc_era(er, ip):
    '''
    er: earned runs
    ip: innings pitched (normalized, instead of something like .1, .2)
    '''
    return er*9/ip

def calc_k9(k,ip):
    return k*9/ip

def calc_whip(walks, hits, ip):
    return (walks+hits)/ip

# calculate batting metrics
def calc_ops(ab,bb,hbp,single,double,triple,hr,sf):
    '''
    calcluate on base plus slugging, which pretty much includes every batting stats at game level
    ab: at bat 
        - include: 
            - hits: single, double, triple, home_run
            - outs: strikeout, field_out, grounded_into_double_play, double_play, triple_play, strikeout_double_play
            - fielder's choice: fielders_choice, fielders_choice_out
            - error: field_error
            - force out: force_out
        - exclude: 
            - sac_bunt 
            - sac_fly_double_play (but this is counted as a sac fly)
            - catcher_interf
    bb: walk
    hbp: hit_by_pitch
    hr: home_run
    sf: sac_fly, sac_fly_double_play
    * there is an unknown event called 'truncated_pa'. Ignore this for now.
    '''
    h = single+double+triple+hr # hits 
    tb = single + 2*double + 3*triple + 4*hr # total bases
    obp = (h+bb+hbp)/(ab+bb+sf+hbp) # on base percentage
    slg = tb/ab # slugging percentage
    return obp+slg

In [149]:
# according to the metrics above, we'll need the following data:
# starting pitcher: er,k,walks,hits,ip
# any batter: ab,bb,hbp,single,double,triple,hr,sf

## BATTING STATS

In [150]:
# for batters
def get_batting_metrics(group):
    ab_events = ['single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 
                 'grounded_into_double_play', 'double_play', 'triple_play', 'strikeout_double_play',
                 'fielders_choice', 'fielders_choice_out', 'field_error', 'force_out']
    
    ab_count = group[group['events'].isin(ab_events)].shape[0]
    bb_count = group[group['events'] == 'walk'].shape[0]
    hbp_count = group[group['events'] == 'hit_by_pitch'].shape[0]
    single_count = group[group['events'] == 'single'].shape[0]
    double_count = group[group['events'] == 'double'].shape[0]
    triple_count = group[group['events'] == 'triple'].shape[0]
    home_run_count = group[group['events'] == 'home_run'].shape[0]
    sac_fly_count = group[group['events'].isin(['sac_fly', 'sac_fly_double_play'])].shape[0]

    return pd.Series({
        'ab': ab_count,
        'bb': bb_count,
        'hbp': hbp_count,
        'single': single_count,
        'double': double_count,
        'triple': triple_count,
        'hr': home_run_count,
        'sf': sac_fly_count
    })

In [151]:
batting_away = df[df['inning_topbot']=='Top'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
batting_home = df[df['inning_topbot']=='Bot'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)

  batting_away = df[df['inning_topbot']=='Top'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
  batting_home = df[df['inning_topbot']=='Bot'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)


In [182]:
batting_away_lineup = batting_away.groupby('game_pk').head(9)
batting_home_lineup = batting_home.groupby('game_pk').head(9)
# double check
print(batting_away_lineup.shape, batting_home_lineup.shape)
print('81 home games per team * 30 teams * 9 players = 21870 rows')
batting_home_lineup.head(10)

(21870, 8) (21870, 8)
81 home games per team * 30 teams * 9 players = 21870 rows


Unnamed: 0_level_0,Unnamed: 1_level_0,ab,bb,hbp,single,double,triple,hr,sf
game_pk,batter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
718767,677594,4,0,0,0,0,0,0,0
718767,543939,3,0,1,0,0,0,0,0
718767,664034,4,0,0,1,1,0,1,0
718767,606192,4,0,0,0,0,0,0,0
718767,663728,4,0,0,1,0,0,0,0
718767,553993,3,0,0,0,1,0,0,0
718767,672284,3,0,0,1,0,0,0,0
718767,600303,3,0,0,0,0,0,0,0
718767,641487,2,1,0,1,0,0,0,0
718768,665161,4,0,0,1,0,0,0,0


In [164]:
batting_away_lineup.to_csv('batting_data_away.csv', index=True)
batting_home_lineup.to_csv('batting_data_home.csv', index=True)

## PITCHING STATS

In [172]:
# for pitchers
# get stats for ERA(?), WHIP, K9
def get_pitching_metrics(group):
    pitch_count = group.shape[0]
    ip_count = group['inning'].nunique() - (3-group['outs_when_up'].nunique())*0.33
    k_count = group[group['events'].isin(['strikeout', 'strikeout_double_play'])].shape[0]
    hits_count = group[group['events'].isin(['single','double','triple','home_run'])].shape[0]
    bb_count = group[group['events'] == 'walk'].shape[0]

    return pd.Series({
        'np': pitch_count,
        'ip': ip_count,
        'k': k_count,
        'hits': hits_count,
        'bb': bb_count
    })

In [176]:
pitching_away = (df[df['inning_topbot']=='Bot']
                .groupby(['game_pk','pitcher'],sort=False)
                .apply(get_pitching_metrics))
pitching_home = (df[df['inning_topbot']=='Top']
                .groupby(['game_pk','pitcher'],sort=False)
                .apply(get_pitching_metrics))

  .apply(get_pitching_metrics))
  .apply(get_pitching_metrics))


In [178]:
pitching_away.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,np,ip,k,hits,bb
game_pk,pitcher,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
718767,669456,87.0,6.0,3.0,6.0,0.0
718767,663986,12.0,1.0,1.0,0.0,0.0
718767,675916,24.0,0.67,0.0,1.0,1.0
718767,660853,6.0,0.67,1.0,0.0,0.0
718768,656302,86.0,7.0,10.0,2.0,0.0
718768,607481,17.0,0.67,2.0,0.0,1.0
718768,608665,11.0,0.67,0.0,1.0,0.0
718768,625643,27.0,1.0,1.0,1.0,1.0
718769,660271,93.0,6.0,10.0,2.0,3.0
718769,623474,16.0,1.0,1.0,0.0,0.0


In [181]:
starting_pitcher_away = pitching_away.groupby('game_pk').head(1)
starting_pitcher_home = pitching_home.groupby('game_pk').head(1)
# double check
print(starting_pitcher_away.shape, starting_pitcher_home.shape)
print('81 home games per team * 30 teams * 1 player = 2430 rows')
starting_pitcher_home.head()

(2430, 5) (2430, 5)
81 home games per team * 30 teams * 1 player = 2430 rows


Unnamed: 0_level_0,Unnamed: 1_level_0,np,ip,k,hits,bb
game_pk,pitcher,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
718767,622491,76.0,6.0,6.0,1.0,0.0
718768,664285,85.0,5.0,4.0,6.0,0.0
718769,666205,72.0,5.0,3.0,4.0,1.0
718770,628711,79.0,6.0,6.0,4.0,0.0
718772,571945,90.0,4.0,6.0,10.0,0.0


In [183]:
starting_pitcher_away.to_csv('starting_pitcher_data_away.csv', index=True)
starting_pitcher_home.to_csv('starting_pitcher_data_home.csv', index=True)

---
# ARCHIVE/TEST

In [131]:
test0330 = statcast('2023-03-30')

This is a large query, it may take a moment to complete


100%|██████████| 1/1 [00:00<00:00,  3.78it/s]


In [132]:
test0330_away = test0330[test0330['inning_topbot']=='Top'].reset_index().sort_index(ascending=False).reset_index(drop=True)
test0330_home = test0330[test0330['inning_topbot']=='Bot'].reset_index().sort_index(ascending=False).reset_index(drop=True)

In [135]:
batting_0330away = test0330_away.groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
batting_0330home = test0330_home.groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)

  batting_0330away = test0330_away.groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
  batting_0330home = test0330_home.groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)


In [136]:
batting_0330away

Unnamed: 0_level_0,Unnamed: 1_level_0,ab,bb,hbp,single,double,triple,hr,sf
game_pk,batter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
718767,680757,4,0,0,0,0,0,0,0
718767,642708,4,0,0,1,0,0,0,0
718767,608070,4,0,0,0,1,0,0,0
718767,605137,4,0,0,1,0,0,0,0
718767,647304,3,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
718782,683002,3,2,0,0,0,0,0,0
718782,602104,4,0,1,1,0,0,1,0
718782,669720,5,0,0,0,0,0,0,0
718782,624428,4,1,0,0,2,0,0,0


In [124]:
only_dodgers = finalg1dodgeers[finalg1dodgeers['inning_topbot']=='Bot'].reset_index().sort_index(ascending=False).reset_index(drop=True)
only_dodgers[['away_score','home_score','post_away_score','post_home_score','inning','inning_topbot',
            'outs_when_up','at_bat_number','batter','pitch_number','pitcher','events','description']]

Unnamed: 0,away_score,home_score,post_away_score,post_home_score,inning,inning_topbot,outs_when_up,at_bat_number,batter,pitch_number,pitcher,events,description
0,0,0,0,0,1,Bot,0,6,660271,1,543037,field_out,hit_into_play
1,0,0,0,0,1,Bot,1,7,605141,1,543037,,called_strike
2,0,0,0,0,1,Bot,1,7,605141,2,543037,,foul
3,0,0,0,0,1,Bot,1,7,605141,3,543037,field_out,hit_into_play
4,0,0,0,0,1,Bot,2,8,518692,1,543037,,foul
...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,3,2,3,2,10,Bot,1,80,669242,2,664776,,called_strike
132,3,2,3,2,10,Bot,1,80,669242,3,664776,,ball
133,3,2,3,2,10,Bot,1,80,669242,4,664776,single,hit_into_play
134,3,2,3,2,10,Bot,1,81,660271,1,641482,field_out,hit_into_play


In [130]:
batting_dodgers = only_dodgers.groupby('batter',sort=False).apply(get_batting_metrics)
batting_dodgers

  batting_dodgers = only_dodgers.groupby('batter',sort=False).apply(get_batting_metrics)


Unnamed: 0_level_0,ab,bb,hbp,single,double,triple,hr,sf
batter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
660271,5,0,0,0,1,0,0,0
605141,3,0,0,0,0,0,0,1
518692,5,0,0,0,0,1,1,0
606192,4,0,0,1,0,0,0,0
571970,3,0,1,0,0,0,0,0
571771,3,0,0,0,0,1,0,0
669257,3,0,0,0,0,0,0,1
666158,3,1,0,0,0,0,0,0
669242,4,0,0,1,1,0,0,0


In [114]:
away_batting_pivoted.head()

Unnamed: 0_level_0,408234_ab,443558_ab,444482_ab,444489_ab,446334_ab,453568_ab,455117_ab,456781_ab,457705_ab,457759_ab,...,691406_sf,691718_sf,691783_sf,693049_sf,693304_sf,694384_sf,694497_sf,696100_sf,696285_sf,807799_sf
game_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
716352,,,,,,,,,,,...,,,,,,,,,,
716353,,,,,,,,,,,...,,,,,,,,,,
716354,,,,,,,,,,,...,,,,,,,,,,
716355,,,,,,,,,,,...,,,,,,,,,,
716356,,,,,,,,,,,...,,,,,,,,,,


In [122]:
finalg1dodgeers = statcast('2024-10-25')

This is a large query, it may take a moment to complete


100%|██████████| 1/1 [00:00<00:00,  5.14it/s]


In [98]:
clean_dodgers = finalg1dodgeers.reset_index().sort_index(ascending=False).reset_index(drop=True)[['away_score','home_score','post_away_score','post_home_score','inning','inning_topbot',
                                                                                  'outs_when_up','at_bat_number','batter','pitch_number','pitcher','events','description']]
clean_dodgers.head(15)

Unnamed: 0,away_score,home_score,post_away_score,post_home_score,inning,inning_topbot,outs_when_up,at_bat_number,batter,pitch_number,pitcher,events,description
0,0,0,0,0,1,Top,0,1,650402,1,656427,,ball
1,0,0,0,0,1,Top,0,1,650402,2,656427,,foul
2,0,0,0,0,1,Top,0,1,650402,3,656427,,ball
3,0,0,0,0,1,Top,0,1,650402,4,656427,field_out,hit_into_play
4,0,0,0,0,1,Top,1,2,665742,1,656427,,ball
5,0,0,0,0,1,Top,1,2,665742,2,656427,,ball
6,0,0,0,0,1,Top,1,2,665742,3,656427,,ball
7,0,0,0,0,1,Top,1,2,665742,4,656427,walk,ball
8,0,0,0,0,1,Top,1,3,592450,1,656427,,called_strike
9,0,0,0,0,1,Top,1,3,592450,2,656427,,swinging_strike


In [99]:
clean_dodgers.tail(15)

Unnamed: 0,away_score,home_score,post_away_score,post_home_score,inning,inning_topbot,outs_when_up,at_bat_number,batter,pitch_number,pitcher,events,description
292,3,2,3,2,10,Bot,0,78,669257,2,664776,,ball
293,3,2,3,2,10,Bot,0,78,669257,3,664776,,ball
294,3,2,3,2,10,Bot,0,78,669257,4,664776,,called_strike
295,3,2,3,2,10,Bot,0,78,669257,5,664776,field_out,hit_into_play
296,3,2,3,2,10,Bot,1,79,666158,1,664776,,ball
297,3,2,3,2,10,Bot,1,79,666158,2,664776,,ball
298,3,2,3,2,10,Bot,1,79,666158,3,664776,,called_strike
299,3,2,3,2,10,Bot,1,79,666158,4,664776,,ball
300,3,2,3,2,10,Bot,1,79,666158,5,664776,walk,ball
301,3,2,3,2,10,Bot,1,80,669242,1,664776,,called_strike


In [101]:
df['events'].unique()

array([None, 'field_out', 'strikeout', 'single', 'double',
       'grounded_into_double_play', 'walk', 'hit_by_pitch', 'home_run',
       'force_out', 'fielders_choice_out', 'catcher_interf',
       'field_error', 'sac_fly', 'triple', 'sac_bunt',
       'strikeout_double_play', 'sac_fly_double_play', 'fielders_choice',
       'double_play', 'truncated_pa', 'triple_play'], dtype=object)

In [171]:
clean_dodgers['inning'].nunique()

10