# Predicting MLB Slugging Percentage

### April 26, 2025

Import packages

In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Load datasets

In [34]:
# Read the three Baseball Savant exports (general hitting stats, bat tracking,
# exit velocity) in one pass; the paths are relative to the notebook's
# working directory.
df_general, df_bat_track, df_exit_velo = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/avg_stats.csv',
        'Data/bat-tracking.csv',
        'Data/exit_velocity.csv',
    )
)

#### Prepare Data

This data was downloaded from MLB's official statistics branch, Baseball Savant (https://baseballsavant.mlb.com/). There are three total files containing:

* general hitting statistics like plate appearances and batting average
* bat tracking statistics like average bat speed
* exit velocity

Before exploring all of the data, I first want to merge the three files into a single dataframe. Let's make sure we only include players that appear in all 3 datasets.

In [29]:
# Show (rows, columns) for each raw frame so mismatched row counts stand out
# before merging.
for frame in (df_general, df_bat_track, df_exit_velo):
    print(frame.shape)

(252, 14)
(214, 18)
(252, 18)


Since the bat tracking dataframe has fewer rows than the other two, I'll check that its player names are unique and then remove from the larger datasets any players that don't appear in it.

In [31]:
# Number of distinct player names in the bat-tracking data; compare this with
# the frame's row count to see whether any name is repeated.
len(df_bat_track['name'].unique())

214

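The number of unique names (214) equals the number of rows in the bat tracking dataframe, so each entry is a distinct player. I will now drop from the other two datasets any players that are not in the bat tracking data, and then merge all three.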

In [None]:
# Player names present in the bat-tracking data; only these players are kept
# in the other two frames.
names = df_bat_track['name'].unique()

# Keep only the rows whose player appears in `names` (boolean mask via isin).
# The explicit .copy() detaches each result from the frame it was filtered
# from, so later in-place edits (such as renaming columns) act on an
# independent frame instead of triggering pandas' SettingWithCopyWarning.
df_general = df_general[df_general['last_name, first_name'].isin(names)].copy()
df_exit_velo = df_exit_velo[df_exit_velo['last_name, first_name'].isin(names)].copy()

# Both frames should now report the same number of rows.
print(df_general.shape)
print(df_exit_velo.shape)

(214, 14)
(214, 18)


In [39]:
# Give the player-name column the same label ('name') in every frame so all
# three can be joined on a single key. rename() returns a new frame, and
# reassigning the result avoids mutating a filtered slice in place.
name_col_rename = {'last_name, first_name': 'name'}
df_general = df_general.rename(columns=name_col_rename)
df_exit_velo = df_exit_velo.rename(columns=name_col_rename)

# Inner joins keep only players that appear in all three datasets. Columns
# shared by both sides (other than 'name') receive pandas' default _x/_y
# suffixes, e.g. player_id_x.
# validate='one_to_one' makes pandas raise a MergeError if a name is
# duplicated on either side, rather than silently multiplying rows.
df = pd.merge(df_general, df_exit_velo, on='name', how='inner', validate='one_to_one')
df = pd.merge(df, df_bat_track, on='name', how='inner', validate='one_to_one')

df.head(10)

Unnamed: 0,name,player_id_x,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,...,squared_up_per_swing,blast_per_bat_contact,blast_per_swing,swing_length,swords,batter_run_value,whiffs,whiff_per_swing,batted_ball_events,batted_ball_event_per_swing
0,"Duran, Jarren",680776,2024,735,515,0.285,0.271,0.014,0.492,0.448,...,0.231986,0.177474,0.137083,7.55794,21,8.23781,259,0.227592,447,0.392794
1,"Ohtani, Shohei",660271,2024,731,479,0.31,0.314,-0.004,0.646,0.66,...,0.274579,0.257783,0.183348,7.740924,10,35.103536,326,0.288751,420,0.372011
2,"Henderson, Gunnar",683002,2024,719,475,0.281,0.283,-0.002,0.529,0.492,...,0.25248,0.228111,0.178539,7.233486,20,-5.457286,241,0.217313,429,0.386835
3,"Semien, Marcus",543760,2024,718,546,0.237,0.251,-0.014,0.391,0.391,...,0.306655,0.119912,0.098022,7.656666,27,-33.790273,203,0.182554,482,0.433453
4,"Soto, Juan",665742,2024,713,461,0.288,0.316,-0.028,0.569,0.646,...,0.319218,0.252674,0.205212,7.287239,28,13.588277,173,0.187839,409,0.444083
5,"Witt Jr., Bobby",677951,2024,709,538,0.332,0.315,0.017,0.588,0.577,...,0.274527,0.204571,0.161116,7.096457,16,19.805583,236,0.212421,468,0.421242
6,"Judge, Aaron",592450,2024,704,390,0.322,0.31,0.012,0.701,0.723,...,0.241584,0.257975,0.184158,8.178268,14,52.210148,289,0.286139,343,0.339604
7,"Guerrero Jr., Vladimir",665489,2024,697,524,0.323,0.321,0.002,0.544,0.567,...,0.277247,0.241667,0.194073,7.73941,9,24.317748,206,0.196941,443,0.423518
8,"De La Cruz, Elly",682829,2024,696,403,0.259,0.24,0.019,0.471,0.428,...,0.221436,0.18958,0.132457,7.281976,31,2.83253,298,0.301314,350,0.353893
9,"Tovar, Ezequiel",678662,2024,695,467,0.269,0.233,0.036,0.469,0.401,...,0.20856,0.11758,0.080156,7.545632,17,-4.912897,409,0.318288,405,0.315175
