In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pybaseball
from pybaseball import statcast
pybaseball.cache.enable()

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Get pitch level data
def collect_pitch_level_data(start, end):
    df = statcast(start_dt= start,end_dt= end).reset_index().sort_index(ascending=False).reset_index(drop=True)
    return df



Collect pitch level and convert to game level data

In [3]:
# Convert pitch level to game level
def get_away_batting_order(group):
    away_batters = group.loc[group['inning_topbot'] == 'Top', 'batter'].unique()[:9]
    away_b_dict = {f'away_b{i+1}':away_batters[i] for i in range(9)}
    return pd.Series(away_b_dict)

def get_home_batting_order(group):
    home_batters = group.loc[group['inning_topbot'] == 'Bot', 'batter'].unique()[:9]
    home_b_dict = {f'home_b{i+1}':home_batters[i] for i in range(9)}
    return pd.Series(home_b_dict)
def group_by_game(df):
    data_without_batters = df.groupby('game_pk',sort=False).apply(lambda group: pd.Series({
    'home_result': None,
    'date': group['game_date'].iloc[0],  # only one corresponding value
    'away_team': group['away_team'].iloc[0],  # only one corresponding value
    'home_team': group['home_team'].iloc[0],  # only one corresponding value
    'away_final_score': group['post_away_score'].iloc[-1],
    'home_final_score': group['post_home_score'].iloc[-1],
    'away_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Bot'), 'pitcher'].iloc[0],
    'home_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Top'), 'pitcher'].iloc[0],
    })).sort_values(by='date', ascending=True).reset_index()
    
    data_without_batters['home_result'] = np.where(data_without_batters['home_final_score'] > data_without_batters['away_final_score'], 'W', 'L')
    
    away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
    home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)

    data_with_batters = data_without_batters.merge(away_bs, on='game_pk').merge(home_bs, on='game_pk')
    
    data_without_batters['home_result'] = np.where(data_without_batters['home_final_score'] > data_without_batters['away_final_score'], 'W', 'L')

    away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
    home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)

    data_with_batters = data_without_batters.merge(away_bs, on='game_pk').merge(home_bs, on='game_pk')

    return data_with_batters

Add batter stats to each game


In [4]:
def get_player_game_batting(df, filename = "", savefile = False):
    batting_away = df[df['inning_topbot']=='Top'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
    batting_home = df[df['inning_topbot']=='Bot'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
    
    batting_away_lineup = batting_away.groupby('game_pk').head(9)
    batting_home_lineup = batting_home.groupby('game_pk').head(9)

    if savefile:
        batting_away_lineup.to_csv(filename + "_away.csv", index=True)
        batting_home_lineup.to_csv(filename + "_home.csv", index=True)

    return batting_home_lineup, batting_away_lineup

In [5]:
def add_bat_stats_to_games(data, batting_stats, batter_stats = ["ab", "bb", "hbp", "single", "double", "triple", "hr", "sf"]):
    
    batters = ["away_b1", "away_b2", "away_b3", "away_b4", "away_b5", "away_b6", "away_b7", "away_b8", "away_b9", "home_b1", "home_b2", "home_b3", "home_b4", "home_b5", "home_b6", "home_b7", "home_b8", "home_b9"]
    batter_stats.append("batter")

    # Append stats for every batter and clean
    for batter in batters:
        data = data.merge(batting_stats, how="left", left_on= ["game_pk",batter], right_on=["game_pk","batter"], suffixes=("","_"+batter))
    
    data = data.rename(columns = lambda col: col + "_away_b1" if col in batter_stats else col)

    for batter in batters:
        data = data.drop(columns=["batter_" + batter])

    home_stats, away_stats = data.copy(), data.copy()
    home_stats = home_stats.loc[:, ~home_stats.columns.str.contains("away")]
    away_stats = away_stats.loc[:, ~away_stats.columns.str.contains("home")]

    home_stats.drop(columns = ['home_result', 'home_final_score'], inplace = True)
    away_stats.drop(columns = ['away_final_score'], inplace = True)

    for i in range(9):
        home_stats.drop(columns=f"home_b{i+1}", inplace=True)
        away_stats.drop(columns=f"away_b{i+1}", inplace=True)

    home_stats = home_stats.rename(
    columns={col: col.replace("home", "") for col in home_stats.columns if "home" in col}
)
    away_stats = away_stats.rename(
    columns={col: col.replace("away", "") for col in away_stats.columns if "away" in col}
)
    
    team_game_stats = pd.concat([home_stats, away_stats])
    team_game_stats.rename(columns = {"_team": "team"}, inplace = True)
    
    return team_game_stats

Determine previous games

In [6]:
def get_previous_n_games(results, n, filename = "", save_file = False):
    #home_teams, away_teams = results.copy(), results.copy()


    #home_teams["team"] = home_teams["home_team"]
    #away_teams["team"] = away_teams["away_team"]
    games = results[["game_pk","date","team"]]

    games["date"] = pd.to_datetime(games["date"])

    games.sort_values(by = ["team","date"], inplace = True, axis = 0, ascending = [True, True])
    prev_col = "game_pk"

    for i in range(n):
        column_name = f"prev_{i+1}_game_pk"
        games[column_name] = games.groupby('team')[prev_col].shift()
        prev_col = column_name
    
    if save_file:
        games.to_csv(filename, index = False)
    
    return games

Add previous game stats to games


In [45]:
def add_prev_game_stats(games, prev_games, N, batter_games_stats):
    games = games.merge(prev_games, how="left", left_on=["game_pk","home_team"], right_on=["game_pk","team"], suffixes=("", "_home"))
    games = games.merge(prev_games, how="left", left_on=["game_pk","away_team"], right_on=["game_pk","team"], suffixes=("", "_away"))
    for i in range(N):
        home_team_col = f"prev_{i+1}_game_pk"
        away_team_col = f"prev_{i+1}_game_pk_away"
        # Add home_team stats
        games = games.merge(batter_games_stats, how="left", left_on= [home_team_col , "home_team"], right_on= ["game_pk", "team"], suffixes=("",f"_{i+1}"))
        # Add away_team stats
        games = games.merge(batter_games_stats, how="left", left_on= [away_team_col , "away_team"], right_on= ["game_pk", "team"], suffixes=("",f"_away_{i+1}"))
    games = games.drop(columns = ["team","away_b1", "away_b2", "away_b3", "away_b4", "away_b5", "away_b6", "away_b7", "away_b8", "away_b9", "home_b1", "home_b2", "home_b3", "home_b4", "home_b5", "home_b6", "home_b7", "home_b8", "home_b9", "date_home", "date_away","prev_1_game_pk", "prev_2_game_pk", "prev_3_game_pk", "prev_4_game_pk", "prev_5_game_pk", "prev_6_game_pk", "prev_7_game_pk", "prev_8_game_pk", "prev_9_game_pk", "prev_10_game_pk", "date_away", "team_away", "prev_1_game_pk_away", "prev_2_game_pk_away", "prev_3_game_pk_away", "prev_4_game_pk_away", "prev_5_game_pk_away", "prev_6_game_pk_away", "prev_7_game_pk_away", "prev_8_game_pk_away", "prev_9_game_pk_away", "prev_10_game_pk_away"])
        
    for i in range(N):
        games = games.drop(columns = [f"date_{i+1}", f"team_{i+1}",f"game_pk_{i+1}",f"date_away_{i+1}", f"team_away_{i+1}",f"game_pk_away_{i+1}"])
    return games

In [None]:
N = 10
# Get pitch level data
df = collect_pitch_level_data(start= '2023-03-30',end= '2023-10-01')

# Group into games
games = group_by_game(df)

# Get batter stats per game
batting_home_lineup = pd.read_csv("batting_data_home.csv")
batting_away_lineup = pd.read_csv("batting_data_away.csv")
batter_stats = pd.concat([batting_home_lineup, batting_away_lineup])
#batting_home_lineup, batting_away_lineup = get_player_game_batting(df, savefile = False)

games_with_batting_stats = add_bat_stats_to_games(games, batter_stats)

prev_games = get_previous_n_games(games_with_batting_stats, N)

games_with_prev_stats = add_prev_game_stats(games, prev_games, N, games_with_batting_stats)


In [43]:
games_with_batting_stats = add_bat_stats_to_games(games, batter_stats)
prev_games = get_previous_n_games(games_with_batting_stats, N)
games_with_prev_stats = add_prev_game_stats(games, prev_games, N, games_with_batting_stats)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games["date"] = pd.to_datetime(games["date"])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games.sort_values(by = ["team","date"], inplace = True, axis = 0, ascending = [True, True])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games[column_name] = games.groupby('team')[prev_col].shift()
A value is trying to be set on a copy of a slice from a DataFra

['game_pk', 'home_result', 'date', 'away_team', 'home_team', 'away_final_score', 'home_final_score', 'away_starting_pitcher', 'home_starting_pitcher', 'away_b1', 'away_b2', 'away_b3', 'away_b4', 'away_b5', 'away_b6', 'away_b7', 'away_b8', 'away_b9', 'home_b1', 'home_b2', 'home_b3', 'home_b4', 'home_b5', 'home_b6', 'home_b7', 'home_b8', 'home_b9', 'date_home', 'team', 'prev_1_game_pk', 'prev_2_game_pk', 'prev_3_game_pk', 'prev_4_game_pk', 'prev_5_game_pk', 'prev_6_game_pk', 'prev_7_game_pk', 'prev_8_game_pk', 'prev_9_game_pk', 'prev_10_game_pk', 'date_away', 'team_away', 'prev_1_game_pk_away', 'prev_2_game_pk_away', 'prev_3_game_pk_away', 'prev_4_game_pk_away', 'prev_5_game_pk_away', 'prev_6_game_pk_away', 'prev_7_game_pk_away', 'prev_8_game_pk_away', 'prev_9_game_pk_away', 'prev_10_game_pk_away']
['game_pk', 'home_result', 'date', 'away_team', 'home_team', 'away_final_score', 'home_final_score', 'away_starting_pitcher', 'home_starting_pitcher', 'away_b1', 'away_b2', 'away_b3', 'away_b4

In [44]:
games_with_prev_stats.to_csv("games_with_prev_stats.csv", index = False)


# ARCHIVE

Full pitch level data

In [2]:
def collect_pitch_level_data(start, end):
    df = statcast(start_dt= start,end_dt= end).reset_index().sort_index(ascending=False).reset_index(drop=True)
    return df

In [3]:
df = collect_pitch_level_data(start= '2023-03-30',end= '2023-10-01')

This is a large query, it may take a moment to complete


100%|██████████| 186/186 [00:17<00:00, 10.82it/s]


Group pitches by game

In [7]:
def get_away_batting_order(group):
    away_batters = group.loc[group['inning_topbot'] == 'Top', 'batter'].unique()[:9]
    away_b_dict = {f'away_b{i+1}':away_batters[i] for i in range(9)}
    return pd.Series(away_b_dict)

def get_home_batting_order(group):
    home_batters = group.loc[group['inning_topbot'] == 'Bot', 'batter'].unique()[:9]
    home_b_dict = {f'home_b{i+1}':home_batters[i] for i in range(9)}
    return pd.Series(home_b_dict)


def group_by_game(df):
    data_without_batters = df.groupby('game_pk',sort=False).apply(lambda group: pd.Series({
    'home_result': None,
    'date': group['game_date'].iloc[0],  # only one corresponding value
    'away_team': group['away_team'].iloc[0],  # only one corresponding value
    'home_team': group['home_team'].iloc[0],  # only one corresponding value
    'away_final_score': group['post_away_score'].iloc[-1],
    'home_final_score': group['post_home_score'].iloc[-1],
    'away_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Bot'), 'pitcher'].iloc[0],
    'home_starting_pitcher': group.loc[(group['inning'] == 1) & (group['inning_topbot'] == 'Top'), 'pitcher'].iloc[0],
    })).sort_values(by='date', ascending=True).reset_index()
    
    data_without_batters['home_result'] = np.where(data_without_batters['home_final_score'] > data_without_batters['away_final_score'], 'W', 'L')
    
    away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
    home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)

    data_with_batters = data_without_batters.merge(away_bs, on='game_pk').merge(home_bs, on='game_pk')
    
    data_without_batters['home_result'] = np.where(data_without_batters['home_final_score'] > data_without_batters['away_final_score'], 'W', 'L')

    away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
    home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)

    data_with_batters = data_without_batters.merge(away_bs, on='game_pk').merge(home_bs, on='game_pk')

    return data_with_batters


In [8]:
games = group_by_game(df)


  data_without_batters = df.groupby('game_pk',sort=False).apply(lambda group: pd.Series({
  away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
  home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)
  away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
  home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)


In [74]:
games.head()
cols = ["game_pk","date", "away_team", "away_final_score","home_final_score","home_team", "home_result","away_starting_pitcher",	"home_starting_pitcher"]
games_training_data = games[cols].copy()
games_training_data.head()

Unnamed: 0,game_pk,date,away_team,away_final_score,home_final_score,home_team,home_result,away_starting_pitcher,home_starting_pitcher
0,718767,2023-03-30,CLE,0,3,SEA,W,669456,622491
1,718782,2023-03-30,BAL,10,9,BOS,L,502043,446372
2,718780,2023-03-30,ATL,7,2,WSH,L,608331,571578
3,718779,2023-03-30,PHI,7,11,TEX,W,605400,594798
4,718778,2023-03-30,COL,7,2,SD,L,608566,605483


In [9]:
def get_batting_metrics(group):
    ab_events = ['single', 'double', 'triple', 'home_run', 'strikeout', 'field_out', 
                'grounded_into_double_play', 'double_play', 'triple_play', 'strikeout_double_play',
                'fielders_choice', 'fielders_choice_out', 'field_error', 'force_out']
    
    ab_count = group[group['events'].isin(ab_events)].shape[0]
    bb_count = group[group['events'] == 'walk'].shape[0]
    hbp_count = group[group['events'] == 'hit_by_pitch'].shape[0]
    single_count = group[group['events'] == 'single'].shape[0]
    double_count = group[group['events'] == 'double'].shape[0]
    triple_count = group[group['events'] == 'triple'].shape[0]
    home_run_count = group[group['events'] == 'home_run'].shape[0]
    sac_fly_count = group[group['events'].isin(['sac_fly', 'sac_fly_double_play'])].shape[0]

    return pd.Series({
        'ab': ab_count,
        'bb': bb_count,
        'hbp': hbp_count,
        'single': single_count,
        'double': double_count,
        'triple': triple_count,
        'hr': home_run_count,
        'sf': sac_fly_count
    })

def get_player_game_batting(df, filename = "", savefile = False):
    batting_away = df[df['inning_topbot']=='Top'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
    batting_home = df[df['inning_topbot']=='Bot'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
    
    batting_away_lineup = batting_away.groupby('game_pk').head(9)
    batting_home_lineup = batting_home.groupby('game_pk').head(9)

    if savefile:
        batting_away_lineup.to_csv(filename + "_away.csv", index=True)
        batting_home_lineup.to_csv(filename + "_home.csv", index=True)

    return batting_home_lineup, batting_away_lineup

In [10]:
batting_home_lineup, batting_away_lineup = get_player_game_batting(df, savefile = False)

  batting_away = df[df['inning_topbot']=='Top'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)
  batting_home = df[df['inning_topbot']=='Bot'].groupby(['game_pk','batter'],sort=False).apply(get_batting_metrics)


In [90]:
def get_previous_n_games(results, n, filename = "", save_file = False):
    #results = pd.read_csv("data_with_batter_stats.csv")
    home_teams, away_teams = results.copy(), results.copy()


    home_teams["team"] = home_teams["home_team"]
    away_teams["team"] = away_teams["away_team"]

    games = pd.concat([home_teams[["game_pk","date","team"]], away_teams[["game_pk","date","team"]]])

    games["date"] = pd.to_datetime(games["date"])

    games.sort_values(by = ["team","date"], inplace = True, axis = 0, ascending = [True, True])
    prev_col = "game_pk"

    for i in range(n):
        column_name = f"prev_{i+1}_game_pk"
        games[column_name] = games.groupby('team')[prev_col].shift()
        prev_col = column_name
    
    if save_file:
        games.to_csv(filename, index = False)
    
    return games

In [89]:
def add_prev_game_stats(games, N, game_batter_results):
    '''
    Adds previous game statistics to each game record by merging with historical game data.
    
    Args:
        games (pd.DataFrame): DataFrame containing game records with previous game IDs
        N (int): Number of previous games to include
        game_batter_results (pd.DataFrame): DataFrame containing batter statistics for each game
        
    Returns:
        pd.DataFrame: Original games DataFrame with added columns for previous game statistics
    '''

    for i in range(N):
        game_id_col = f"prev_{i+1}_game_pk"
        # Check this one
        games = games.merge(game_batter_results, how="left", left_on= [game_id_col,"home_team"] , right_on= ["game_pk", "team"], suffixes=("",f"_{i}"))
    return games

In [127]:
def get_game_batter_stats(data, historical_game_data, N = 10, batter_stats = ["ab", "bb", "hbp", "single", "double", "triple", "hr", "sf"]):
    
    data = data.merge(historical_game_data, how="left", left_on=["game_pk","home_team"], right_on=["game_pk","team"])
    data = data.merge(historical_game_data, how="left", left_on=["game_pk","away_team"], right_on=["game_pk","team"])
     # Append stats for every batter and clean
    for i in range(N):
        game_id_col = f"prev_{i+1}_game_pk"
        # Home team stats
        data = data.merge(data, how="left", left_on= [game_id_col,"home_team"], right_on=["game_pk","team"], suffixes=("","_"+i))
        data = data.merge(data, how="left", left_on= [game_id_col,"away_team"], right_on=["game_pk","team"], suffixes=("","_"+i))

        #Away team stats
    
    #cols = ["away_final_score", "home_final_score", "away_starting_pitcher", "home_starting_pitcher", "away_b1", "away_b2", "away_b3", "away_b4", "away_b5", "away_b6", "away_b7", "away_b8", "away_b9", "home_b1", "home_b2", "home_b3", "home_b4", "home_b5", "home_b6", "home_b7", "home_b8", "home_b9"]
    #data.drop(columns=cols, inplace=True)
    #data = data.rename(columns = lambda col: col + "_away_b1" if col in batter_stats else col)

    #home_stats = data[["game_pk", "home_team", "ab_home_b1", "bb_home_b1", "hbp_home_b1", "single_home_b1", "double_home_b1", "triple_home_b1", "hr_home_b1", "sf_home_b1", "ab_home_b2", "bb_home_b2", "hbp_home_b2", "single_home_b2", "double_home_b2", "triple_home_b2", "hr_home_b2", "sf_home_b2", "ab_home_b3", "bb_home_b3", "hbp_home_b3", "single_home_b3", "double_home_b3", "triple_home_b3", "hr_home_b3", "sf_home_b3", "ab_home_b4", "bb_home_b4", "hbp_home_b4", "single_home_b4", "double_home_b4", "triple_home_b4", "hr_home_b4", "sf_home_b4", "ab_home_b5", "bb_home_b5", "hbp_home_b5", "single_home_b5", "double_home_b5", "triple_home_b5", "hr_home_b5", "sf_home_b5", "ab_home_b6", "bb_home_b6", "hbp_home_b6", "single_home_b6", "double_home_b6", "triple_home_b6", "hr_home_b6", "sf_home_b6", "ab_home_b7", "bb_home_b7", "hbp_home_b7", "single_home_b7", "double_home_b7", "triple_home_b7", "hr_home_b7", "sf_home_b7", "ab_home_b8", "bb_home_b8", "hbp_home_b8", "single_home_b8", "double_home_b8", "triple_home_b8", "hr_home_b8", "sf_home_b8", "ab_home_b9", "bb_home_b9", "hbp_home_b9", "single_home_b9", "double_home_b9", "triple_home_b9", "hr_home_b9", "sf_home_b9"]]
    
    #away_stats = data[["game_pk", "home_result", "date", "ab_away_b1", "bb_away_b1", "hbp_away_b1", "single_away_b1", "double_away_b1", "triple_away_b1", "hr_away_b1", "sf_away_b1", "ab_away_b2", "bb_away_b2", "hbp_away_b2", "single_away_b2", "double_away_b2", "triple_away_b2", "hr_away_b2", "sf_away_b2", "ab_away_b3", "bb_away_b3", "hbp_away_b3", "single_away_b3", "double_away_b3", "triple_away_b3", "hr_away_b3", "sf_away_b3", "ab_away_b4", "bb_away_b4", "hbp_away_b4", "single_away_b4", "double_away_b4", "triple_away_b4", "hr_away_b4", "sf_away_b4", "ab_away_b5", "bb_away_b5", "hbp_away_b5", "single_away_b5", "double_away_b5", "triple_away_b5", "hr_away_b5", "sf_away_b5", "ab_away_b6", "bb_away_b6", "hbp_away_b6", "single_away_b6", "double_away_b6", "triple_away_b6", "hr_away_b6", "sf_away_b6", "ab_away_b7", "bb_away_b7", "hbp_away_b7", "single_away_b7", "double_away_b7", "triple_away_b7", "hr_away_b7", "sf_away_b7", "ab_away_b8", "bb_away_b8", "hbp_away_b8", "single_away_b8", "double_away_b8", "triple_away_b8", "hr_away_b8", "sf_away_b8", "ab_away_b9", "bb_away_b9", "hbp_away_b9", "single_away_b9", "double_away_b9", "triple_away_b9", "hr_away_b9", "sf_away_b9"]]
    return data

In [135]:

game_batter_results = get_game_batter_stats(games, batter_stats)

KeyError: 'team'

In [129]:
games_history = get_previous_n_games(game_with_batter_results,10,save_file = False)


# add batter stats for each game
training_data = get_game_batter_stats(game_with_batter_results,games_history)
game_with_batter_results.shape()

MergeError: Passing 'suffixes' which cause duplicate columns {'date_x'} is not allowed.

In [83]:
# Fix batter stats
batter_stats.loc[718767]

Unnamed: 0_level_0,ab,bb,hbp,single,double,triple,hr,sf
batter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
677594,4,0,0,0,0,0,0,0
543939,3,0,1,0,0,0,0,0
664034,4,0,0,1,1,0,1,0
606192,4,0,0,0,0,0,0,0
663728,4,0,0,1,0,0,0,0
553993,3,0,0,0,1,0,0,0
672284,3,0,0,1,0,0,0,0
600303,3,0,0,0,0,0,0,0
641487,2,1,0,1,0,0,0,0
680757,4,0,0,0,0,0,0,0


In [79]:
#training_data = add_prev_game_stats(games, 10, game_batter_results)
batter_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ab,bb,hbp,single,double,triple,hr,sf
game_pk,batter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
718767,677594,4,0,0,0,0,0,0,0
718767,543939,3,0,1,0,0,0,0,0
718767,664034,4,0,0,1,1,0,1,0
718767,606192,4,0,0,0,0,0,0,0
718767,663728,4,0,0,1,0,0,0,0


In [25]:
games.head()

Unnamed: 0,game_pk,home_result,date,away_team,home_team,away_final_score,home_final_score,away_starting_pitcher,home_starting_pitcher,away_b1,...,away_b9,home_b1,home_b2,home_b3,home_b4,home_b5,home_b6,home_b7,home_b8,home_b9
0,718767,W,2023-03-30,CLE,SEA,0,3,669456,622491,680757,...,664702,677594,543939,664034,606192,663728,553993,672284,600303,641487
1,718782,L,2023-03-30,BAL,BOS,10,9,502043,446372,656775,...,622761,657077,646240,457759,807799,594807,671213,624414,624512,571771
2,718780,L,2023-03-30,ATL,WSH,7,2,608331,571578,660670,...,606115,657041,608841,600869,642086,660688,669743,671277,645302,682928
3,718779,W,2023-03-30,PHI,TEX,7,11,605400,594798,607208,...,669016,543760,608369,663993,666969,673962,641680,543257,543543,669701
4,718778,L,2023-03-30,COL,SD,7,2,608566,605483,602074,...,678662,663757,665742,592518,593428,630105,572761,543592,673490,621311


In [46]:
def add_bat_stats_to_games(data, batting_stats, batter_stats = None):
    """
    Adds batting statistics to game-level data for each batter.
    """

    if batter_stats is None:
        batter_stats = ["ab", "bb", "hbp", "single", "double", "triple", "hr", "sf"]

    batters = ["away_b1", "away_b2", "away_b3", "away_b4", "away_b5", "away_b6", "away_b7", "away_b8", "away_b9", "home_b1", "home_b2", "home_b3", "home_b4", "home_b5", "home_b6", "home_b7", "home_b8", "home_b9"]
    batter_stats.append("batter")

    # Append stats for every batter and clean
    for batter in batters:
        data = data.merge(batting_stats, how="left", left_on= ["game_pk",batter], right_on=["game_pk","batter"], suffixes=("","_"+batter))
    
    data = data.rename(columns = lambda col: col + "_away_b1" if col in batter_stats else col)

    for batter in batters:
        data = data.drop(columns=["batter_" + batter])

    home_stats, away_stats = data.copy(), data.copy()
    home_stats = home_stats.loc[:, ~home_stats.columns.str.contains("away")]
    away_stats = away_stats.loc[:, ~away_stats.columns.str.contains("home")]

    home_stats.drop(columns = ['home_result', 'home_final_score'], inplace = True)
    away_stats.drop(columns = ['away_final_score'], inplace = True)

    for i in range(9):
        home_stats.drop(columns=f"home_b{i+1}", inplace=True)
        away_stats.drop(columns=f"away_b{i+1}", inplace=True)

    home_stats = home_stats.rename(
    columns={col: col.replace("home", "") for col in home_stats.columns if "home" in col}
)
    away_stats = away_stats.rename(
    columns={col: col.replace("away", "") for col in away_stats.columns if "away" in col}
)
    
    team_game_stats = pd.concat([home_stats, away_stats])
    team_game_stats.rename(columns = {"_team": "team"}, inplace = True)
    
    return team_game_stats

In [47]:
N = 10
YEAR = 2022
START_DATE = f'{YEAR}-04-07'
END_DATE = f'{YEAR}-10-05'

df = collect_pitch_level_data(start= START_DATE,end= END_DATE)

    
games = group_by_game(df)
batting_home_lineup, batting_away_lineup = pd.read_csv("batting_data_2022_home.csv"), pd.read_csv("batting_data_2022_away.csv")
batter_stats = pd.concat([batting_home_lineup, batting_away_lineup])

games_with_batting_stats = add_bat_stats_to_games(games, batter_stats)

This is a large query, it may take a moment to complete


100%|██████████| 182/182 [00:25<00:00,  7.02it/s]
  data_without_batters = df.groupby('game_pk',sort=False).apply(lambda group: pd.Series({
  away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
  home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)
  away_bs = df.groupby('game_pk',sort=False).apply(get_away_batting_order)
  home_bs = df.groupby('game_pk',sort=False).apply(get_home_batting_order)
