## Set Up

In [1]:
import pandas as pd

In [2]:
class ProcessPitcherData:
    def load_pitcher_data(self, pitcher_data_file):
        # load pitcher data
        pitcher_data = pd.read_csv(pitcher_data_file)
        return pitcher_data

## Data Exploration

In [3]:
# file path for pitcher data
pitcher_data_file = 'savant_pitch_level.csv'

# create instance of ProcessPitcherData class
process_pitcher_data = ProcessPitcherData()

# load pitcher data
pitcher_data = process_pitcher_data.load_pitcher_data(pitcher_data_file)

pitcher_data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,role_key,sp_indicator,rp_indicator,pitch_number_appearance,pitcher_at_bat_number,times_faced
0,FF,2021-04-05,93.2,0.84,5.74,"Duffy, Danny",592696,518633,strikeout,swinging_strike,...,Standard,166.0,-0.014,-0.134,SP,1,0,97,22,3
1,SL,2021-04-05,82.9,1.03,5.69,"Duffy, Danny",592696,518633,,foul,...,Standard,309.0,0.0,0.0,SP,1,0,96,22,3
2,FF,2021-04-05,94.6,-1.85,5.77,"Foltynewicz, Mike",543760,592314,caught_stealing_3b,ball,...,Standard,220.0,0.014,-0.202,SP,1,0,95,19,3
3,FF,2021-04-05,97.1,1.86,6.57,"Rodón, Carlos",657108,607074,field_out,hit_into_play,...,Standard,147.0,-0.007,-0.264,SP,1,0,95,22,3
4,FF,2021-04-05,93.6,0.6,5.85,"Duffy, Danny",592696,518633,,ball,...,Standard,164.0,0.0,0.043,SP,1,0,95,22,3


In [4]:
# Print shape of pitcher data
print(f"Savant data shape: {pitcher_data.shape}")

Savant data shape: (2136337, 91)


In [5]:
# Print the number of unique pitchers in the data
print(f"Number of unique pitchers: {pitcher_data['pitcher'].nunique()}")

Number of unique pitchers: 1383


In [6]:
pitcher_data['game_year'].value_counts()

2023    717945
2021    709852
2022    708540
Name: game_year, dtype: int64

In [7]:
# Find mean speed of each pitch type
pitch_speeds = pitcher_data.groupby('pitch_type')['release_speed'].mean()
# Print pitch speed
print(f"Pitch speed: {pitch_speeds}")

Pitch speed: pitch_type
CH    85.194678
CS    67.631683
CU    78.878982
EP    48.997253
FA    69.154792
FC    89.006994
FF    93.939048
FO    82.967738
FS    86.490498
KC    81.213525
KN    74.067986
PO    89.579167
SC    80.214667
SI    93.192988
SL    84.965771
ST    81.701641
SV    82.306737
Name: release_speed, dtype: float64


## SP deviation analysis

In [8]:
# Filter role key to only include starting pitchers
starting_pitcher_data = pitcher_data[pitcher_data['role_key'] == 'SP']
# Print average pitch velocity for starting pitchers
print(f"Average pitch velocity for starting pitchers: {starting_pitcher_data['release_speed'].mean()}")
# Print stndard deviation of pitch velocity for starting pitchers
print(f"Standard deviation of pitch velocity for starting pitchers: {starting_pitcher_data['release_speed'].std()}")
# Print average pitch appearance per game for starting pitchers
print(f"Average pitch appearance per game for starting pitchers: {starting_pitcher_data['pitch_number_appearance'].mean()}")

# Filter role key to only include relief pitchers
relief_pitcher_data = pitcher_data[pitcher_data['role_key'] == 'RP']
# Print average pitch velocity for relief pitchers
print(f"Average pitch velocity for relief pitchers: {relief_pitcher_data['release_speed'].mean()}")
# Print stndard deviation of pitch velocity for relief pitchers
print(f"Standard deviation of pitch velocity for relief pitchers: {relief_pitcher_data['release_speed'].std()}")
# Print average pitch appearance per game for relief pitchers
print(f"Average pitch appearance per game for relief pitchers: {relief_pitcher_data['pitch_number_appearance'].mean()}")


Average pitch velocity for starting pitchers: 88.4974181228329
Standard deviation of pitch velocity for starting pitchers: 5.88746313345347
Average pitch appearance per game for starting pitchers: 44.61787466676655
Average pitch velocity for relief pitchers: 89.4757325830808
Standard deviation of pitch velocity for relief pitchers: 6.343235393244449
Average pitch appearance per game for relief pitchers: 13.088986246709219


In [9]:
# Filter starting pitchers to only include FF pitch type
starting_pitcher_ff_data = starting_pitcher_data[starting_pitcher_data['pitch_type'] == 'FF']

# Find the mean pitch velocity for FF pitch type
starting_pitcher_ff_velocity = starting_pitcher_ff_data['release_speed'].mean()
# Print the mean pitch velocity for FF pitch type
print(f"Mean pitch velocity for FF pitch type: {starting_pitcher_ff_velocity}")



Mean pitch velocity for FF pitch type: 93.54146940278915


In [10]:
# Filter pitch appearances to first 10 pitches of starting pitchers
starting_pitcher_first_10_pitches = starting_pitcher_ff_data[starting_pitcher_ff_data['pitch_number_appearance'] < 20]

# Find the mean pitch velocity for FF pitch type
starting_pitcher_first_10_pitches_velocity = starting_pitcher_first_10_pitches['release_speed'].mean()
# Print the mean pitch velocity for FF pitch type
print(f"Mean pitch velocity for FF pitch type for the first ten pitches: {starting_pitcher_first_10_pitches_velocity}")

# Filter pitch appearances bigger than 10 pitches of starting pitchers
starting_pitcher_after_10_pitches = starting_pitcher_ff_data[starting_pitcher_ff_data['pitch_number_appearance'] > 10]

# Find the mean pitch velocity for FF pitch type
starting_pitcher_after_10_pitches_velocity = starting_pitcher_after_10_pitches['release_speed'].mean()
# Print the mean pitch velocity for FF pitch type
print(f"Mean pitch velocity for FF pitch type after 10 pitches: {starting_pitcher_after_10_pitches_velocity}")

Mean pitch velocity for FF pitch type for the first ten pitches: 93.67752536769005
Mean pitch velocity for FF pitch type after 10 pitches: 93.52855286711743


In [11]:
sp_ff_analysis = starting_pitcher_ff_data[['release_speed', 'pitch_number_appearance', 'player_name']].copy()

# Combine pitch number appearance for each pitcher by taking the mean of release speed
sp_ff_analysis = sp_ff_analysis.groupby(['pitch_number_appearance', 'player_name'])['release_speed'].mean().reset_index()

# Rename release speed column
sp_ff_analysis = sp_ff_analysis.rename(columns={'release_speed': 'avg_release_speed'})

sp_ff_analysis.head(10)


Unnamed: 0,pitch_number_appearance,player_name,avg_release_speed
0,1,"Abbott, Andrew",92.880952
1,1,"Abbott, Cory",90.825
2,1,"Adon, Joan",94.348
3,1,"Akin, Keegan",92.15
4,1,"Alcantara, Sandy",96.9
5,1,"Alexander, Tyler",89.845455
6,1,"Alexy, A.J.",93.75
7,1,"Allard, Kolby",91.121053
8,1,"Allen, Logan",91.871875
9,1,"Alzolay, Adbert",92.583333


In [12]:
# Find the standard deviation of release speed
sp_ff_analysis_std = sp_ff_analysis['avg_release_speed'].std()

# print standard deviation of release speed
print(f"Standard deviation of release speed: {sp_ff_analysis_std}")

Standard deviation of release speed: 2.2556643217550785


In [13]:
# Find the players with the highest standard deviation of pitch velocity
sp_ff_analysis_std = sp_ff_analysis.groupby('player_name')['avg_release_speed'].std().reset_index()

# Rename release speed column
sp_ff_analysis_std = sp_ff_analysis_std.rename(columns={'avg_release_speed': 'std_release_speed'})

# Sort values by standard deviation of pitch velocity
sp_ff_analysis_std = sp_ff_analysis_std.sort_values('std_release_speed', ascending=False)

# Print the top 5 players with the highest standard deviation of pitch velocity
sp_ff_analysis_std.head()

Unnamed: 0,player_name,std_release_speed
428,"Ramírez, Erasmo",3.001527
465,"Santiago, Héctor",2.13988
526,"Thornton, Trent",1.758557
497,"Staumont, Josh",1.687207
391,"Oviedo, Luis",1.639529


In [23]:
# Find the mean for each player release speed over the first 10 pitches
sp_ff_analysis_first_10 = sp_ff_analysis[sp_ff_analysis['pitch_number_appearance'] < 10]

# Combine pitch number appearance for each pitcher by taking the mean of release speed
sp_ff_analysis_first_10 = sp_ff_analysis_first_10.groupby('player_name')['avg_release_speed'].mean().reset_index()

# Rename release speed column
sp_ff_analysis_first_10 = sp_ff_analysis_first_10.rename(columns={'avg_release_speed': 'avg_release_speed_first_10'})

sp_ff_analysis_first_10.head()

Unnamed: 0,player_name,avg_release_speed_first_10
0,"Abbott, Andrew",93.353304
1,"Abbott, Cory",91.500053
2,"Adon, Joan",95.210389
3,"Akin, Keegan",92.466828
4,"Albers, Andrew",88.522222


In [25]:
# Find the mean for each player release speed over the first 10 pitches
sp_ff_analysis_after_10 = sp_ff_analysis[sp_ff_analysis['pitch_number_appearance'] > 10]

# Combine pitch number appearance for each pitcher by taking the mean of release speed
sp_ff_analysis_after_10 = sp_ff_analysis_after_10.groupby('player_name')['avg_release_speed'].mean().reset_index()

# Rename release speed column
sp_ff_analysis_after_10 = sp_ff_analysis_after_10.rename(columns={'avg_release_speed': 'avg_release_speed_after_10'})

sp_ff_analysis_after_10.head()



Unnamed: 0,player_name,avg_release_speed_after_10
0,"Abbott, Andrew",92.589819
1,"Abbott, Cory",91.179971
2,"Adon, Joan",94.776458
3,"Akin, Keegan",91.831292
4,"Albers, Andrew",88.081217


In [26]:
# Merge first 10 pitches and after 10 pitches dataframes
sp_ff_analysis_merged = pd.merge(sp_ff_analysis_first_10, sp_ff_analysis_after_10, on='player_name')

# Print the first 5 rows of the merged dataframe
sp_ff_analysis_merged.head()

Unnamed: 0,player_name,avg_release_speed_first_10,avg_release_speed_after_10
0,"Abbott, Andrew",93.353304,92.589819
1,"Abbott, Cory",91.500053,91.179971
2,"Adon, Joan",95.210389,94.776458
3,"Akin, Keegan",92.466828,91.831292
4,"Albers, Andrew",88.522222,88.081217


In [27]:
# Subtract the first 10 pitches average release speed from the after 10 pitches average release speed
sp_ff_analysis_merged['diff_release_speed'] = sp_ff_analysis_merged['avg_release_speed_after_10'] - sp_ff_analysis_merged['avg_release_speed_first_10']

# Sort values by difference in release speed
sp_ff_analysis_merged = sp_ff_analysis_merged.sort_values('diff_release_speed', ascending=False)

# Print the top 5 players with the highest difference in release speed
sp_ff_analysis_merged.head()

Unnamed: 0,player_name,avg_release_speed_first_10,avg_release_speed_after_10,diff_release_speed
412,"Ramírez, Erasmo",88.5,94.0,5.5
480,"Staumont, Josh",95.033333,97.7,2.666667
236,"Jiménez, Dany",93.25,94.8,1.55
432,"Rondón, Angel",91.314286,92.7875,1.473214
427,"Rodón, Carlos",94.44648,95.830771,1.384291


## Hunter Greene Analysis

In [16]:
# filter data to 2023
pitcher_data_2023 = pitcher_data[pitcher_data['game_year'] == 2023]

# Print shape of pitcher data
print(f"Savant data shape: {pitcher_data_2023.shape}")

# Print the number of unique pitchers in the data
print(f"Number of unique pitchers: {pitcher_data_2023['pitcher'].nunique()}")



Savant data shape: (717945, 91)
Number of unique pitchers: 863


In [17]:
# Print stats of players that have name hunter
Hunter = pitcher_data_2023[pitcher_data_2023['player_name'].str.contains('Greene, Hunter')]

# Print shape of pitcher data
print(f"Savant data shape: {Hunter.shape}")

Hunter.head()


Savant data shape: (2089, 91)


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,role_key,sp_indicator,rp_indicator,pitch_number_appearance,pitcher_at_bat_number,times_faced
1419963,FF,2023-03-30,101.0,-2.59,5.84,"Greene, Hunter",595978,668881,walk,ball,...,Standard,226.0,-0.016,0.293,SP,1,0,83,18,2
1419976,FF,2023-03-30,100.5,-2.57,5.94,"Greene, Hunter",595978,668881,,ball,...,Standard,231.0,0.0,0.107,SP,1,0,82,18,2
1420002,FF,2023-03-30,101.4,-2.74,5.98,"Greene, Hunter",595978,668881,,foul,...,Standard,226.0,0.0,0.0,SP,1,0,81,18,2
1420029,SL,2023-03-30,91.0,-2.61,5.88,"Greene, Hunter",595978,668881,,ball,...,Standard,120.0,0.0,0.033,SP,1,0,80,18,2
1420057,FF,2023-03-30,100.1,-2.65,6.02,"Greene, Hunter",595978,668881,,ball,...,Standard,223.0,0.0,-0.013,SP,1,0,79,18,2


In [18]:
# Print average pitch number apperance
print(f"Average pitch number appearance: {Hunter['pitch_number_appearance'].mean()}")

Average pitch number appearance: 49.05552896122547


In [19]:
# Find the mean of the effective speed column
print(f"Mean of effective speed: {Hunter['release_speed'].mean()}")

Mean of effective speed: 93.50622605363996


In [20]:
# print the pitch number column

hunter_first_pitch = Hunter[Hunter['pitch_number_appearance'] < 5] 

# Print shape of pitcher data
print(f"Savant data shape: {hunter_first_pitch.shape}")

# Print the mean of the release speed column
print(f"Mean of release speed: {hunter_first_pitch['release_speed'].mean()}")

hunter_first_pitch.head()

Savant data shape: (88, 91)
Mean of release speed: 94.63409090909094


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,role_key,sp_indicator,rp_indicator,pitch_number_appearance,pitcher_at_bat_number,times_faced
1423789,SL,2023-03-30,89.5,-2.65,5.96,"Greene, Hunter",457705,668881,,foul,...,Standard,125.0,0.0,-0.05,SP,1,0,4,3,1
1423920,SL,2023-03-30,89.6,-2.77,5.95,"Greene, Hunter",668804,668881,single,hit_into_play,...,Standard,145.0,-0.025,0.238,SP,1,0,3,2,1
1424152,FF,2023-03-30,100.1,-2.65,6.02,"Greene, Hunter",668804,668881,,ball,...,Standard,224.0,0.0,0.025,SP,1,0,2,2,1
1424295,FF,2023-03-30,100.5,-2.56,6.14,"Greene, Hunter",665833,668881,field_out,hit_into_play,...,Standard,221.0,0.022,-0.238,SP,1,0,1,1,1
1462216,FF,2023-04-07,99.3,-2.67,6.08,"Greene, Hunter",607208,668881,strikeout,swinging_strike,...,Standard,220.0,-0.021,-0.173,SP,1,0,4,1,1


In [21]:
# print the pitch number that is not 1

hunter_not_first_pitch = Hunter[Hunter['pitch_number_appearance'] > 5]

# Print shape of pitcher data
print(f"Savant data shape: {hunter_not_first_pitch.shape}")

# Print the mean of the release speed column
print(f"Mean of release speed: {hunter_not_first_pitch['release_speed'].mean()}")

hunter_not_first_pitch.head()

Savant data shape: (1979, 91)
Mean of release speed: 93.48549039433784


Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,role_key,sp_indicator,rp_indicator,pitch_number_appearance,pitcher_at_bat_number,times_faced
1419963,FF,2023-03-30,101.0,-2.59,5.84,"Greene, Hunter",595978,668881,walk,ball,...,Standard,226.0,-0.016,0.293,SP,1,0,83,18,2
1419976,FF,2023-03-30,100.5,-2.57,5.94,"Greene, Hunter",595978,668881,,ball,...,Standard,231.0,0.0,0.107,SP,1,0,82,18,2
1420002,FF,2023-03-30,101.4,-2.74,5.98,"Greene, Hunter",595978,668881,,foul,...,Standard,226.0,0.0,0.0,SP,1,0,81,18,2
1420029,SL,2023-03-30,91.0,-2.61,5.88,"Greene, Hunter",595978,668881,,ball,...,Standard,120.0,0.0,0.033,SP,1,0,80,18,2
1420057,FF,2023-03-30,100.1,-2.65,6.02,"Greene, Hunter",595978,668881,,ball,...,Standard,223.0,0.0,-0.013,SP,1,0,79,18,2


In [22]:
view = Hunter[['release_speed', "pitch_number_appearance"]]

# sort release speed
view.sort_values(by=['release_speed'], ascending=False).head(20)

Unnamed: 0,release_speed,pitch_number_appearance
1420668,102.1,57
1421520,102.0,30
1421101,101.9,42
1422402,101.8,15
1476330,101.7,73
1420442,101.6,65
1420456,101.6,64
1422536,101.4,14
1476188,101.4,79
1459965,101.4,36
