## Set Up

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
class ProcessPitcherData:
    """Small helper that groups the pitcher-data loading step."""

    def load_pitcher_data(self, pitcher_data_file):
        """Read the pitch-level CSV at ``pitcher_data_file`` into a DataFrame.

        Parameters
        ----------
        pitcher_data_file : str or path-like
            Location of the CSV file, passed straight to ``pd.read_csv``.

        Returns
        -------
        pd.DataFrame
            The parsed file contents, using ``pd.read_csv`` defaults.
        """
        return pd.read_csv(pitcher_data_file)

In [3]:
# Path to the pitch-level CSV; a bare file name, so it is resolved against
# the notebook's current working directory
pitcher_data_file = 'savant_pitch_level.csv'

# Loader object whose load_pitcher_data method wraps pd.read_csv
process_pitcher_data = ProcessPitcherData()

# Read the whole CSV into a DataFrame; later cells filter and aggregate it
pitcher_data = process_pitcher_data.load_pitcher_data(pitcher_data_file)

# Preview the first rows
pitcher_data.head()

Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp,role_key,sp_indicator,rp_indicator,pitch_number_appearance,pitcher_at_bat_number,times_faced
0,FF,2021-04-05,93.2,0.84,5.74,"Duffy, Danny",592696,518633,strikeout,swinging_strike,...,Standard,166.0,-0.014,-0.134,SP,1,0,97,22,3
1,SL,2021-04-05,82.9,1.03,5.69,"Duffy, Danny",592696,518633,,foul,...,Standard,309.0,0.0,0.0,SP,1,0,96,22,3
2,FF,2021-04-05,94.6,-1.85,5.77,"Foltynewicz, Mike",543760,592314,caught_stealing_3b,ball,...,Standard,220.0,0.014,-0.202,SP,1,0,95,19,3
3,FF,2021-04-05,97.1,1.86,6.57,"Rodón, Carlos",657108,607074,field_out,hit_into_play,...,Standard,147.0,-0.007,-0.264,SP,1,0,95,22,3
4,FF,2021-04-05,93.6,0.6,5.85,"Duffy, Danny",592696,518633,,ball,...,Standard,164.0,0.0,0.043,SP,1,0,95,22,3


## Data Exploration

In [4]:
# Keep only the rows whose game_year is 2023
pitcher_data = pitcher_data.loc[pitcher_data['game_year'].eq(2023)]

# Disabled alternative: keep only rows past the 100th pitch of an appearance
# pitcher_data = pitcher_data[pitcher_data['pitch_number_appearance'] > 100]

In [5]:
# Keep only the three columns the delta analysis needs
pitcher_delta_df = pitcher_data.loc[
    :, ['player_name', 'delta_run_exp', 'pitch_number_appearance']
]

pitcher_delta_df.head()

Unnamed: 0,player_name,delta_run_exp,pitch_number_appearance
1418392,"Martinez, Nick",-0.299,101
1418393,"Martinez, Nick",0.0,100
1418394,"Martinez, Nick",0.0,99
1418395,"Martinez, Nick",0.055,98
1418396,"Martinez, Nick",-0.053,97


In [6]:
# Average delta_run_exp for every (player_name, pitch_number_appearance) pair,
# then turn the group keys back into ordinary columns
pitcher_delta_df = (
    pitcher_delta_df
    .groupby(['player_name', 'pitch_number_appearance'])['delta_run_exp']
    .agg('mean')
    .reset_index()
)

pitcher_delta_df.head()

Unnamed: 0,player_name,pitch_number_appearance,delta_run_exp
0,"Abad, Fernando",1,0.207833
1,"Abad, Fernando",2,0.005167
2,"Abad, Fernando",3,0.012167
3,"Abad, Fernando",4,0.042833
4,"Abad, Fernando",5,0.002


In [7]:
def calculate_delta_stats(pitcher_delta_df):
    """Summarise each pitcher's early-vs-late ``delta_run_exp``.

    For every ``player_name`` the rows are split at that player's mean
    ``pitch_number_appearance``: rows strictly below the mean count as
    "early", rows at or above it count as "late".

    Parameters
    ----------
    pitcher_delta_df : pd.DataFrame
        Must contain the columns ``player_name``, ``pitch_number_appearance``
        and ``delta_run_exp``.

    Returns
    -------
    pd.DataFrame
        One row per player, in order of first appearance, with the columns
        ``player_name``, ``delta_diff`` (absolute difference between the early
        and late mean ``delta_run_exp``) and ``delta_avg`` (mean
        ``delta_run_exp`` over all of the player's rows). The index is a
        fresh ``0..n-1`` range. ``delta_diff`` is NaN when either half is
        empty, e.g. for a player with a single row. Rows whose
        ``player_name`` is missing are skipped (``groupby`` drops NaN keys).
    """
    columns = ['player_name', 'delta_diff', 'delta_avg']
    records = []

    # A single groupby pass replaces one full-frame boolean scan per player;
    # sort=False keeps players in the order they first appear in the input.
    for player, player_data in pitcher_delta_df.groupby('player_name', sort=False):
        pitch_numbers = player_data['pitch_number_appearance']

        # The player's own mean pitch number is the early/late split point
        split_point = pitch_numbers.mean()

        # Both masks are computed explicitly so a NaN pitch number lands in
        # neither half
        is_early = pitch_numbers < split_point
        is_late = pitch_numbers >= split_point

        mean_early_score = player_data.loc[is_early, 'delta_run_exp'].mean()
        mean_late_score = player_data.loc[is_late, 'delta_run_exp'].mean()

        records.append({
            'player_name': player,
            'delta_diff': abs(mean_early_score - mean_late_score),
            'delta_avg': player_data['delta_run_exp'].mean(),
        })

    # Build the frame once: avoids the quadratic concat-in-a-loop and gives
    # every row a unique index label instead of a repeated 0
    return pd.DataFrame(records, columns=columns)

In [8]:

# Build the per-pitcher summary: one row per player_name with delta_diff and delta_avg
delta_grade_df = calculate_delta_stats(pitcher_delta_df)

delta_grade_df.head()

Unnamed: 0,player_name,delta_diff,delta_avg
0,"Abad, Fernando",0.084875,0.045578
0,"Abbott, Andrew",0.006984,-0.004907
0,"Abbott, Cory",0.005908,0.016924
0,"Abreu, Albert",0.014791,0.003354
0,"Abreu, Bryan",0.007405,-0.020875


In [9]:
# Summary statistics of delta_avg across all pitchers
delta_grade_df['delta_avg'].describe()

count    863.000000
mean       0.006647
std        0.036388
min       -0.085667
25%       -0.012279
50%        0.001598
75%        0.016964
max        0.272900
Name: delta_avg, dtype: float64

In [10]:
# Select the row(s) whose player_name contains "Ohtani"
ohtani = delta_grade_df.loc[delta_grade_df['player_name'].str.contains('Ohtani')]

ohtani

Unnamed: 0,player_name,delta_diff,delta_avg
0,"Ohtani, Shohei",0.003818,-0.009127


In [11]:
# Select Gerrit Cole; the pattern is only the first name, so any other
# player_name containing "Gerrit" would be returned as well
gerrit_cole = delta_grade_df.loc[delta_grade_df['player_name'].str.contains('Gerrit')]

gerrit_cole

Unnamed: 0,player_name,delta_diff,delta_avg
0,"Cole, Gerrit",0.000144,-0.013463


In [12]:
# Select the row(s) whose player_name contains "Scherzer" (Max Scherzer)
max_scherzer = delta_grade_df.loc[delta_grade_df['player_name'].str.contains('Scherzer')]

max_scherzer

Unnamed: 0,player_name,delta_diff,delta_avg
0,"Scherzer, Max",0.007383,-0.006513


In [13]:
# Order pitchers by delta_avg ascending, so the most negative values come first
delta_grade_df = delta_grade_df.sort_values(by='delta_avg')

# Show the 60 rows with the lowest delta_avg
delta_grade_df.head(60)


Unnamed: 0,player_name,delta_diff,delta_avg
0,"Williams, Luke",0.012,-0.085667
0,"Stephenson, Robert",0.132329,-0.080487
0,"Dixon, Brandon",0.010583,-0.073286
0,"Vieira, Thyago",0.054542,-0.072396
0,"Schoop, Jonathan",0.111389,-0.064529
0,"Scott, Tanner",0.087789,-0.058976
0,"Bautista, Félix",0.059172,-0.052545
0,"Uribe, Abner",0.097747,-0.052375
0,"Kimbrel, Craig",0.093141,-0.052338
0,"Hernández, Yonny",0.0934,-0.0515


In [14]:
# Discard every row (pitcher) that has a NaN in any column, then order the
# remaining rows by delta_avg ascending (negative values first)
delta_grade_df = (
    delta_grade_df
    .dropna()
    .sort_values(by=['delta_avg'], ascending=True)
)

delta_grade_df.head(60)

Unnamed: 0,player_name,delta_diff,delta_avg
0,"Williams, Luke",0.012,-0.085667
0,"Stephenson, Robert",0.132329,-0.080487
0,"Dixon, Brandon",0.010583,-0.073286
0,"Vieira, Thyago",0.054542,-0.072396
0,"Schoop, Jonathan",0.111389,-0.064529
0,"Scott, Tanner",0.087789,-0.058976
0,"Bautista, Félix",0.059172,-0.052545
0,"Uribe, Abner",0.097747,-0.052375
0,"Kimbrel, Craig",0.093141,-0.052338
0,"Hernández, Yonny",0.0934,-0.0515


In [15]:
# Attach each pitcher's id (the `pitcher` column of pitcher_data) as MLBAMID.
# The name -> id lookup is built once instead of rescanning pitcher_data for
# every row; keep='first' takes the first id seen for a name, which is what
# unique()[0] selected before.
# NOTE(review): if two different pitchers share a player_name, only the first
# id is kept -- confirm names are unique per pitcher in this data.
player_id_lookup = (
    pitcher_data[['player_name', 'pitcher']]
    .dropna(subset=['player_name'])
    .drop_duplicates(subset='player_name', keep='first')
    .set_index('player_name')['pitcher']
)

# Assign by column name rather than hard-coded iloc positions. The nullable
# Int64 dtype keeps ids as integers; a name with no match becomes <NA>
# instead of raising IndexError.
delta_grade_df['MLBAMID'] = (
    delta_grade_df['player_name'].map(player_id_lookup).astype('Int64')
)

delta_grade_df.head()




Unnamed: 0,player_name,delta_diff,delta_avg,MLBAMID
0,"Williams, Luke",0.012,-0.085667,663897
0,"Stephenson, Robert",0.132329,-0.080487,596112
0,"Dixon, Brandon",0.010583,-0.073286,641525
0,"Vieira, Thyago",0.054542,-0.072396,600986
0,"Schoop, Jonathan",0.111389,-0.064529,570731


In [16]:
# Export delta_grade_df to a CSV file; index=False leaves the row index out of the file
delta_grade_df.to_csv('pitcher_delta_grade.csv', index=False)



In [17]:
# # normalize the delta_diff and delta_avg columns
# scaler = StandardScaler()

# delta_grade_df[['delta_diff', 'delta_avg']] = scaler.fit_transform(delta_grade_df[['delta_diff', 'delta_avg']])

# delta_grade_df.head()


# # Sort by delta_avg
# delta_grade_df.sort_values(by=['delta_avg'], ascending=True, inplace=True)

# delta_grade_df.head()
