In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pybaseball import statcast, chadwick_register, playerid_reverse_lookup
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix

In [2]:
#df = statcast(start_dt = '2022-04-01', end_dt = '2025-03-26')

# drop any pitch types that are null 
#df = df.dropna(subset=['pitch_type'])

# reset index (for model preparation)
#df = df.reset_index(drop=True)

df = pd.read_csv('baseball.csv')

# print the shape of the data 
df.shape

(2256509, 113)

In [3]:
#df.to_csv('baseball.csv', index=False)

In [6]:
df.rename(columns={'player_name': 'pitcher_name'}, inplace=True)
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,n_thruorder_pitcher,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,1,1,,,,,1.23,1.44,-1.44,
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,1,1,,,,,1.07,1.35,-1.35,
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,1,1,,,,,1.25,1.02,1.02,
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,1,1,,,,,1.22,0.82,0.82,
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,1,1,,,,,0.99,1.11,1.11,


In [None]:
chadwick = chadwick_register()

player_mapping =  chadwick[['key_mlbam', 'name_last', 'name_first']]
player_mapping['batter_name'] = player_mapping['name_last'] +', ' + player_mapping['name_first']
player_mapping = player_mapping[['key_mlbam', 'batter_name']].rename(columns={'key_mlbam': 'batter'})

df = df.merge(
    player_mapping,
    on='batter',
    how='left'
)


Gathering player lookup table. This may take a moment.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_mapping['batter_name'] = player_mapping['name_last'] +', ' + player_mapping['name_first']


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,batter_name
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,1,,,,,1.23,1.44,-1.44,,
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,1,,,,,1.07,1.35,-1.35,,
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,1,,,,,1.25,1.02,1.02,,"Peraza, Oswald"
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,1,,,,,1.22,0.82,0.82,,"Peraza, Oswald"
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,1,,,,,0.99,1.11,1.11,,"Peraza, Oswald"


In [10]:
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,batter_name
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,1,,,,,1.23,1.44,-1.44,,
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,1,,,,,1.07,1.35,-1.35,,
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,1,,,,,1.25,1.02,1.02,,"Peraza, Oswald"
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,1,,,,,1.22,0.82,0.82,,"Peraza, Oswald"
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,1,,,,,0.99,1.11,1.11,,"Peraza, Oswald"


In [11]:
missing_batters = df[df['batter_name'].isna()]['batter'].unique()

# Fetch missing names
missing_names = playerid_reverse_lookup(missing_batters, key_type='mlbam')[['key_mlbam', 'name_last', 'name_first']]
missing_names['batter_name'] = missing_names['name_last'] + ', ' + missing_names['name_first']

df = df.merge(
    missing_names[['key_mlbam', 'batter_name']].rename(columns={'key_mlbam': 'batter'}),
    on='batter',
    how='left',
    suffixes=('', '_new')
)

# Combine columns
df['batter_name'] = df['batter_name'].fillna(df['batter_name_new'])
df.drop(columns=['batter_name_new'], inplace=True)

Gathering player lookup table. This may take a moment.


In [14]:
df.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,pitcher_name,batter,pitcher,events,description,...,n_priorpa_thisgame_player_at_bat,pitcher_days_since_prev_game,batter_days_since_prev_game,pitcher_days_until_next_game,batter_days_until_next_game,api_break_z_with_gravity,api_break_x_arm,api_break_x_batter_in,arm_angle,batter_name
0,SI,2025-03-25,98.3,-2.47,5.22,"Faherty, Jake",665998,801619,field_out,hit_into_play,...,1,,,,,1.23,1.44,-1.44,,
1,SI,2025-03-25,98.6,-2.48,5.11,"Faherty, Jake",665998,801619,,called_strike,...,1,,,,,1.07,1.35,-1.35,,
2,SI,2025-03-25,100.7,-2.27,5.29,"Faherty, Jake",672724,801619,field_out,hit_into_play,...,1,,,,,1.25,1.02,1.02,,"Peraza, Oswald"
3,SI,2025-03-25,99.7,-2.22,5.26,"Faherty, Jake",672724,801619,,swinging_strike,...,1,,,,,1.22,0.82,0.82,,"Peraza, Oswald"
4,SI,2025-03-25,98.6,-2.44,5.19,"Faherty, Jake",672724,801619,,swinging_strike,...,1,,,,,0.99,1.11,1.11,,"Peraza, Oswald"


In [15]:
missing_ids = df[df['batter_name'].isna()]['batter'].unique()
print(f"Missing batter IDs: {missing_ids}")

Missing batter IDs: [665998 682987 702906 ... 621520 666206 672692]


In [16]:
import requests

def get_batter_name(mlbam_id):
    try:
        url = f"https://statsapi.mlb.com/api/v1/people/{mlbam_id}"
        response = requests.get(url).json()
        return f"{response['people'][0]['lastName']}, {response['people'][0]['firstName']}"
    except:
        return "Unknown"

# Apply to missing IDs
for bid in missing_ids:
    df.loc[df['batter'] == bid, 'batter_name'] = get_batter_name(bid)

In [18]:
# Identify non-player batter IDs (e.g., 0, negative numbers, or IDs not in any registry)
invalid_batters = df[
    df['batter_name'].isna() & 
    (~df['batter'].between(100000, 999999))  # Valid MLBAM IDs are 6-digit numbers
]

# Handle invalid cases
df['batter_name'] = np.where(
    df['batter_name'].isna() & df['batter'].between(100000, 999999),
    "Unknown Player",
    df['batter_name']
)

In [19]:
print(f"Remaining missing batter names: {df['batter_name'].isna().sum()}")

Remaining missing batter names: 0


In [22]:
df.to_csv('cleaned_baseball.csv', index=False)