In [1]:
import pandas as pd
from pybaseball import batting_stats
from pybaseball import cache
from pybaseball import pitching_stats
from pybaseball import playerid_lookup
from pybaseball import statcast_batter
from pybaseball import statcast_batter_expected_stats
from pybaseball import statcast_pitcher_expected_stats
from scipy.stats import zscore

## Batters by wOBA ZScore

In [2]:
# Minimum plate appearances a batter needs in a season to be included.
MIN_BATTER_PA = 150

# Retrieve season batting stats for qualifying batters in each season of
# interest (2019, 2021, 2022 and 2023; 2020 is not fetched).
full_batting_2019 = batting_stats(2019, qual=MIN_BATTER_PA)
full_batting_2021 = batting_stats(2021, qual=MIN_BATTER_PA)
full_batting_2022 = batting_stats(2022, qual=MIN_BATTER_PA)
full_batting_2023 = batting_stats(2023, qual=MIN_BATTER_PA)

# Stack the four season frames vertically; ignore_index=True discards the
# per-season row labels and renumbers the combined rows from 0.
combined_batting_data = pd.concat(
    [full_batting_2019, full_batting_2021, full_batting_2022, full_batting_2023],
    ignore_index=True,
)

In [3]:
# Quick sanity check of the combined batting frame: dimensions and column names.
print(f"Shape of the DataFrame: {combined_batting_data.shape}")
print(f"Column names: {combined_batting_data.columns.tolist()}")

Shape of the DataFrame: (1596, 320)
Column names: ['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes', 'IFH', 'BU', 'BUH', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Dol', 'Spd', 'wRC+', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI', 'PH', 'WPA/LI', 'Clutch', 'FB% (Pitch)', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'BsR', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL%

In [4]:
# Baseline seasons used for each player's historical average.
relevant_years = [2019, 2021, 2022]
filtered_data_batting = combined_batting_data[combined_batting_data['Season'].isin(relevant_years)]

# Average every numeric stat per player across the baseline seasons and expose
# the averaged wOBA as 'avg_wOBA'.
# numeric_only=True is passed explicitly because the frame also holds
# non-numeric columns (e.g. 'Team'): relying on the implicit default is
# deprecated in pandas 1.5 and raises a TypeError in pandas 2.x.
grouped_data = (
    filtered_data_batting
    .groupby(['IDfg', 'Name'])
    .mean(numeric_only=True)
    .reset_index()
    .rename(columns={'wOBA': 'avg_wOBA'})
)

# 2023 wOBA per player, renamed so it does not collide with the baseline average.
woba_2023 = full_batting_2023[['IDfg', 'Name', 'wOBA']].rename(columns={'wOBA': 'wOBA_2023'})

# Attach the 2023 wOBA to the baseline averages. pd.merge defaults to an inner
# join, so only players present in both frames (same IDfg and Name) are kept.
players_off_output = pd.merge(
    grouped_data,
    woba_2023,
    on=['IDfg', 'Name'],
)

  grouped_data = filtered_data_batting.groupby(['IDfg', 'Name']).mean().reset_index().rename(columns={'wOBA': 'avg_wOBA'})


In [5]:
# Show floats with two decimals in rendered DataFrames.
pd.set_option('display.float_format', '{:.2f}'.format)

# Remove the Season column produced by the per-player averaging step.
players_off_output = players_off_output.drop('Season', axis='columns')

# Move 'avg_wOBA' and 'wOBA_2023' to the far right so they sit side by side;
# every other column keeps its current relative position.
column_order = (
    list(players_off_output.columns.drop(['avg_wOBA', 'wOBA_2023']))
    + ['avg_wOBA', 'wOBA_2023']
)

players_off_output = players_off_output[column_order]

In [6]:
# Pull the two wOBA columns out of the merged batter frame.
avg_wOBA, wOBA_2023 = players_off_output['avg_wOBA'], players_off_output['wOBA_2023']

# Standardize each column independently across all players in the frame.
z_scores_avg_woba, z_score_woba_2023 = zscore(avg_wOBA), zscore(wOBA_2023)

In [7]:
# Change in standing per player: the 2023 wOBA z-score minus the z-score of
# the baseline-average wOBA (positive = 2023 z-score is the higher of the two).
players_off_output['zscore_difference'] = z_score_woba_2023 - z_scores_avg_woba

# Render the resulting frame as the cell output.
players_off_output

Unnamed: 0,IDfg,Name,Age,G,AB,PA,H,1B,2B,3B,...,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR,avg_wOBA,wOBA_2023,zscore_difference
0,1744,Miguel Cabrera,37.67,126.00,454.00,502.67,120.33,94.00,15.67,0.00,...,351.33,0.15,0.27,0.25,0.40,0.31,-0.77,0.30,0.29,0.06
1,2136,David Peralta,32.67,127.67,436.00,483.67,113.67,68.67,29.67,4.67,...,341.33,0.16,0.27,0.25,0.37,0.30,1.27,0.32,0.31,-0.19
2,2396,Carlos Santana,34.67,149.00,523.00,617.00,123.00,77.67,21.00,0.33,...,426.67,0.17,0.25,0.25,0.45,0.35,1.77,0.33,0.31,-0.45
3,2434,Nelson Cruz,39.67,128.00,471.67,537.33,127.33,78.33,21.00,0.33,...,354.00,0.13,0.27,0.26,0.52,0.37,1.80,0.35,0.29,-1.62
4,2967,Tommy Pham,32.67,148.00,532.00,612.33,131.67,85.67,26.67,1.67,...,396.00,0.20,0.29,0.25,0.43,0.34,1.73,0.32,0.34,0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,27465,Spencer Torkelson,22.00,110.00,360.00,404.00,73.00,48.00,16.00,1.00,...,263.00,0.17,0.28,0.23,0.38,0.30,-0.90,0.27,0.31,1.40
300,27506,Ha-seong Kim,25.50,133.50,392.00,440.00,92.00,59.50,20.50,2.50,...,312.50,0.21,0.29,0.23,0.34,0.29,2.30,0.29,0.36,2.04
301,27676,Vinnie Pasquantino,24.00,72.00,258.00,298.00,76.00,56.00,10.00,0.00,...,226.00,0.16,0.23,0.29,0.47,0.37,1.50,0.36,0.33,-1.10
302,27684,Michael Massey,24.00,52.00,173.00,194.00,42.00,28.00,9.00,1.00,...,131.00,0.13,0.27,0.24,0.45,0.33,0.80,0.30,0.28,-0.34


In [8]:
 # Persist the batter frame as CSV, leaving the row index out of the file.
players_off_output.to_csv(path_or_buf='Resources/off_output_for_leaning.csv', index=False)

## Pitchers by zERA ZScore

In [9]:
# Statcast expected stats for every pitcher with a minimum of 150 plate
# appearances against, one frame per season (2019, 2021, 2022, 2023).
# statcast_pitcher_expected_stats is imported in the notebook's import cell.
pitcher_data_2019 = statcast_pitcher_expected_stats(2019, 150)
pitcher_data_2021 = statcast_pitcher_expected_stats(2021, 150)
pitcher_data_2022 = statcast_pitcher_expected_stats(2022, 150)
pitcher_data_2023 = statcast_pitcher_expected_stats(2023, 150)

# Stack the four season frames vertically; ignore_index=True renumbers the
# combined rows from 0.
pitcher_combined_data = pd.concat(
    [pitcher_data_2019, pitcher_data_2021, pitcher_data_2022, pitcher_data_2023],
    ignore_index=True,
)

# Display the combined DataFrame
pitcher_combined_data

Unnamed: 0,last_name,first_name,player_id,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff,era,xera,era_minus_xera_diff
0,Bauer,Trevor,545333,2019,911,557,0.23,0.22,0.01,0.43,0.40,0.03,0.32,0.31,0.01,4.48,4.23,0.25
1,Lynn,Lance,458681,2019,875,562,0.24,0.22,0.02,0.39,0.37,0.02,0.29,0.28,0.01,3.67,3.48,0.19
2,Gonzales,Marco,594835,2019,866,657,0.26,0.25,0.01,0.42,0.41,0.01,0.31,0.31,0.00,3.99,4.26,-0.27
3,Minor,Mike,501985,2019,863,588,0.24,0.23,0.02,0.40,0.39,0.01,0.30,0.30,0.00,3.59,3.89,-0.30
4,Bieber,Shane,669456,2019,859,554,0.23,0.24,-0.01,0.39,0.42,-0.02,0.28,0.29,-0.01,3.27,3.80,-0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,Weathers,Ryan,677960,2023,217,161,0.32,0.26,0.06,0.56,0.45,0.11,0.40,0.34,0.06,6.89,4.91,1.98
1116,Gonzales,Marco,594835,2023,215,163,0.28,0.29,-0.00,0.43,0.44,-0.01,0.33,0.35,-0.01,5.22,5.18,0.04
1117,Falter,Bailey,663559,2023,214,170,0.32,0.28,0.03,0.52,0.47,0.05,0.37,0.34,0.03,5.21,4.91,0.30
1118,Liberatore,Matthew,669461,2023,208,155,0.28,0.30,-0.01,0.45,0.51,-0.05,0.35,0.38,-0.03,5.71,6.50,-0.79


In [10]:
# pitcher_data_2023['Name'] = pitcher_data_2023['first_name'] + ' ' + pitcher_data_2023['last_name']

# # Drop the individual 'last_name' and 'first_name' columns
# pitcher_data_2023.drop(columns=['last_name', 'first_name'], inplace=True)

# # Display the modified DataFrame
# pitcher_data_2023

In [11]:
# Baseline seasons for the pitcher comparison.
relevant_years = [2019, 2021, 2022]

# Rows from the baseline seasons only.
filtered_data = pitcher_combined_data.loc[pitcher_combined_data['year'].isin(relevant_years)]

# Rows from the 2023 season only.
pitching_data_2023 = pitcher_combined_data.loc[pitcher_combined_data['year'] == 2023]

# Average every remaining column per pitcher over the baseline seasons and
# expose the averaged 'xera' column under the name 'avg_zERA'.
grouped_data = (
    filtered_data
    .groupby(['player_id', 'last_name', 'first_name'])
    .mean()
    .reset_index()
    .rename(columns={'xera': 'avg_zERA'})
)


In [12]:


# Identifier columns plus the 2023 'xera' value, renamed to 'zERA_2023'.
zera_2023 = (
    pitcher_data_2023
    .loc[:, ['player_id', 'last_name', 'first_name', 'xera']]
    .rename(columns={'xera': 'zERA_2023'})
)

# Join the baseline averages with the 2023 values on id and name. The default
# inner join keeps only pitchers that appear in both frames.
pitcher_output = grouped_data.merge(
    zera_2023,
    on=['player_id', 'last_name', 'first_name'],
)

# Render the merged frame as the cell output.
pitcher_output

Unnamed: 0,player_id,last_name,first_name,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff,era,avg_zERA,era_minus_xera_diff,zERA_2023
0,425794,Wainwright,Adam,2020.67,792.00,571.33,0.25,0.26,-0.01,0.39,0.41,-0.02,0.30,0.32,-0.02,3.65,4.40,-0.74,7.79
1,425844,Greinke,Zack,2020.67,697.33,536.67,0.26,0.26,-0.01,0.40,0.42,-0.01,0.30,0.31,-0.01,3.59,4.20,-0.61,5.39
2,434378,Verlander,Justin,2020.50,756.50,472.00,0.18,0.20,-0.02,0.32,0.34,-0.02,0.23,0.25,-0.02,2.17,2.66,-0.49,3.32
3,446372,Kluber,Corey,2021.50,515.00,370.00,0.26,0.25,0.01,0.40,0.39,0.01,0.31,0.31,0.00,4.08,4.02,0.07,6.18
4,448179,Hill,Rich,2021.50,593.50,408.00,0.25,0.24,0.01,0.42,0.41,0.00,0.32,0.32,0.00,4.06,4.27,-0.20,5.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,677960,Weathers,Ryan,2021.00,401.00,296.00,0.28,0.28,0.00,0.50,0.47,0.02,0.36,0.35,0.01,5.32,5.30,0.02,4.91
145,678394,Bello,Brayan,2022.00,268.00,184.00,0.32,0.25,0.07,0.42,0.35,0.07,0.35,0.30,0.05,4.71,3.80,0.91,4.20
146,680686,Gray,Josiah,2021.50,478.00,308.00,0.24,0.22,0.02,0.50,0.44,0.06,0.35,0.32,0.03,5.26,4.38,0.87,4.63
147,680694,Bradish,Kyle,2022.00,509.00,344.00,0.27,0.25,0.01,0.42,0.40,0.01,0.33,0.33,0.01,4.89,4.49,0.41,4.06


In [13]:
# Show floats with two decimals in rendered DataFrames.
pd.set_option('display.float_format', '{:.2f}'.format)

# Remove the 'year' column left over from the per-pitcher averaging step.
pitcher_output = pitcher_output.drop('year', axis='columns')

In [14]:
# Pull the baseline-average and 2023 columns out of the merged pitcher frame.
avg_zera, zera_2023 = pitcher_output['avg_zERA'], pitcher_output['zERA_2023']

# Standardize each column independently across all pitchers in the frame.
z_scores_avg_zera, z_score_zera_2023 = zscore(avg_zera), zscore(zera_2023)

In [15]:
# Change in standing per pitcher: the 2023 z-score minus the z-score of the
# baseline average.
pitcher_output['zscore_difference'] = z_score_zera_2023 - z_scores_avg_zera

# Collapse first and last name into a single space-separated 'full_name'
# column, then discard the two source columns.
pitcher_output['full_name'] = pitcher_output['first_name'].str.cat(
    others=pitcher_output['last_name'],
    sep=' ',
)
pitcher_output = pitcher_output.drop(['first_name', 'last_name'], axis='columns')

# Final column layout: identifiers first, the raw/expected stat groups next,
# and the three comparison columns last.
column_order = [
    'player_id', 'full_name',
    'pa', 'bip',
    'ba', 'est_ba', 'est_ba_minus_ba_diff',
    'slg', 'est_slg', 'est_slg_minus_slg_diff',
    'woba', 'est_woba', 'est_woba_minus_woba_diff',
    'era', 'era_minus_xera_diff',
    'avg_zERA', 'zERA_2023', 'zscore_difference',
]
pitcher_output = pitcher_output[column_order]

# Render the final frame as the cell output.
pitcher_output

Unnamed: 0,player_id,full_name,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff,era,era_minus_xera_diff,avg_zERA,zERA_2023,zscore_difference
0,425794,Adam Wainwright,792.00,571.33,0.25,0.26,-0.01,0.39,0.41,-0.02,0.30,0.32,-0.02,3.65,-0.74,4.40,7.79,3.21
1,425844,Zack Greinke,697.33,536.67,0.26,0.26,-0.01,0.40,0.42,-0.01,0.30,0.31,-0.01,3.59,-0.61,4.20,5.39,0.84
2,434378,Justin Verlander,756.50,472.00,0.18,0.20,-0.02,0.32,0.34,-0.02,0.23,0.25,-0.02,2.17,-0.49,2.66,3.32,0.77
3,446372,Corey Kluber,515.00,370.00,0.26,0.25,0.01,0.40,0.39,0.01,0.31,0.31,0.00,4.08,0.07,4.02,6.18,1.98
4,448179,Rich Hill,593.50,408.00,0.25,0.24,0.01,0.42,0.41,0.00,0.32,0.32,0.00,4.06,-0.20,4.27,5.53,0.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,677960,Ryan Weathers,401.00,296.00,0.28,0.28,0.00,0.50,0.47,0.02,0.36,0.35,0.01,5.32,0.02,5.30,4.91,-1.28
145,678394,Brayan Bello,268.00,184.00,0.32,0.25,0.07,0.42,0.35,0.07,0.35,0.30,0.05,4.71,0.91,3.80,4.20,0.10
146,680686,Josiah Gray,478.00,308.00,0.24,0.22,0.02,0.50,0.44,0.06,0.35,0.32,0.03,5.26,0.87,4.38,4.63,-0.26
147,680694,Kyle Bradish,509.00,344.00,0.27,0.25,0.01,0.42,0.40,0.01,0.33,0.33,0.01,4.89,0.41,4.49,4.06,-1.05


In [16]:
 # Persist the pitcher frame as CSV, leaving the row index out of the file.
pitcher_output.to_csv(path_or_buf='Resources/pitcher_output_for_leaning.csv', index=False)