In [22]:
from pybaseball import statcast_batter
from pybaseball import playerid_lookup
import pandas as pd
from scipy.stats import zscore
from pybaseball import cache
from pyspark.sql import SparkSession

# Turn on pybaseball's cache before any data is requested by the cells below.
# NOTE(review): presumably this lets repeated queries reuse earlier downloads
# instead of hitting the remote source again — confirm in the pybaseball docs.
cache.enable()


In [8]:
# Players to look up, written as "<first name> <last name>".
player_names = ['david ortiz', 'mike trout', 'bryce harper']  # Add more player names as needed

# Column layout of the result table.
# NOTE: the 'OPS' column holds the yearly mean of the Statcast `woba_value` field,
# not on-base-plus-slugging. The column name is kept so existing consumers keep working.
ops_columns = ['Player', 'Year', 'OPS']

# One small frame per player is collected here and concatenated once after the loop.
# DataFrame.append was deprecated and then removed in pandas 2.0, and growing a frame
# inside a loop re-copies every earlier row on each iteration.
per_player_frames = []

for player_name in player_names:
    # Split only on the first space so multi-word surnames stay in one piece.
    # The surname is passed first, matching the original call order.
    first_name, last_name = player_name.split(maxsplit=1)
    player_info = playerid_lookup(last_name, first_name)

    if player_info.empty:
        print(f"Player {player_name} not found.")
        continue

    # When the lookup returns several rows, the first one is used.
    player_id = player_info['key_mlbam'].iloc[0]

    # Fetch the player's Statcast rows for the desired date range.
    player_data = statcast_batter(start_dt='2008-04-01', end_dt='2023-08-13', player_id=player_id)

    # Parse the dates so the calendar year can be used as the grouping key.
    player_data['game_date'] = pd.to_datetime(player_data['game_date'])
    yearly_ops = player_data.groupby(player_data['game_date'].dt.year)['woba_value'].mean()

    per_player_frames.append(pd.DataFrame({
        'Player': [player_name] * len(yearly_ops),
        'Year': yearly_ops.index,
        'OPS': yearly_ops.to_numpy(),
    }))

# pd.concat raises on an empty list, so fall back to an empty, correctly-shaped frame
# when no player could be resolved.
if per_player_frames:
    ops_data = pd.concat(per_player_frames, ignore_index=True)
else:
    ops_data = pd.DataFrame(columns=ops_columns)

# Display the collected data
ops_data

Gathering Player Data


  ops_data = ops_data.append(pd.DataFrame({'Player': [player_name] * len(yearly_ops),


Gathering Player Data


  df = pd.read_csv(io.StringIO(data.text))
  ops_data = ops_data.append(pd.DataFrame({'Player': [player_name] * len(yearly_ops),


Gathering Player Data


  ops_data = ops_data.append(pd.DataFrame({'Player': [player_name] * len(yearly_ops),


Unnamed: 0,Player,Year,OPS
0,david ortiz,2008,0.361589
1,david ortiz,2009,0.342547
2,david ortiz,2010,0.363279
3,david ortiz,2011,0.392039
4,david ortiz,2012,0.398198
5,david ortiz,2013,0.395434
6,david ortiz,2014,0.338941
7,david ortiz,2015,0.371254
8,david ortiz,2016,0.397737
9,mike trout,2010,0.871429


In [2]:
from pybaseball import statcast_pitcher_expected_stats

# Download the pitcher expected-stats table for 2019, 2021, 2022 and 2023,
# passing 150 as the minimum plate-appearances-against threshold on every call.
pitcher_min_pa = 150
pitcher_data_2019, pitcher_data_2021, pitcher_data_2022, pitcher_data_2023 = (
    statcast_pitcher_expected_stats(season, pitcher_min_pa)
    for season in (2019, 2021, 2022, 2023)
)

# Stack the four seasons into one frame with a fresh 0..n-1 index.
pitcher_season_frames = [
    pitcher_data_2019,
    pitcher_data_2021,
    pitcher_data_2022,
    pitcher_data_2023,
]
pitcher_combined_data = pd.concat(pitcher_season_frames, ignore_index=True)

# Display the combined DataFrame
pitcher_combined_data

Unnamed: 0,last_name,first_name,player_id,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff,era,xera,era_minus_xera_diff
0,Bauer,Trevor,545333,2019,911,557,0.230,0.223,0.007,0.429,0.402,0.027,0.316,0.310,0.006,4.48,4.23,0.249
1,Lynn,Lance,458681,2019,875,562,0.243,0.222,0.021,0.390,0.372,0.018,0.294,0.283,0.011,3.67,3.48,0.192
2,Gonzales,Marco,594835,2019,866,657,0.264,0.254,0.010,0.422,0.407,0.015,0.311,0.311,0.000,3.99,4.26,-0.270
3,Minor,Mike,501985,2019,863,588,0.244,0.228,0.016,0.395,0.389,0.006,0.301,0.298,0.003,3.59,3.89,-0.304
4,Bieber,Shane,669456,2019,859,554,0.230,0.236,-0.006,0.393,0.418,-0.025,0.280,0.295,-0.015,3.27,3.80,-0.525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,Weathers,Ryan,677960,2023,217,161,0.318,0.258,0.060,0.563,0.449,0.114,0.401,0.340,0.061,6.89,4.91,1.980
1116,Gonzales,Marco,594835,2023,215,163,0.282,0.286,-0.004,0.431,0.441,-0.010,0.334,0.348,-0.014,5.22,5.18,0.040
1117,Falter,Bailey,663559,2023,214,170,0.317,0.283,0.034,0.515,0.469,0.046,0.369,0.340,0.029,5.21,4.91,0.304
1118,Liberatore,Matthew,669461,2023,208,155,0.282,0.297,-0.015,0.453,0.508,-0.055,0.349,0.383,-0.034,5.71,6.50,-0.785


In [4]:
# Write the combined pitcher table to the "Resources" folder as CSV, without the row index.
pitcher_combined_csv = "Resources/pitcher_combined_data.csv"
pitcher_combined_data.to_csv(pitcher_combined_csv, index=False)

In [8]:
# Keep only rows from the seasons of interest (2019, 2021, 2022 and 2023).
relevant_years = [2019, 2021, 2022, 2023]
pitcher_in_relevant_year = pitcher_combined_data['year'].isin(relevant_years)
filtered_data = pitcher_combined_data[pitcher_in_relevant_year]

# Per-player mean xERA over every retained season (2023 included).
grouped_data = filtered_data.groupby('player_id')
average_xera = grouped_data['xera'].mean()

# Pair each player's mean xERA with their 2023 xERA. The merge is an inner join,
# so only players that have a 2023 row survive. Both inputs call the value 'xera',
# which is why the suffixes produce 'xera_avg' and 'xera_2023'.
pitcher_rows_2023 = filtered_data[filtered_data['year'] == 2023]
xera_2023 = pd.merge(
    average_xera,
    pitcher_rows_2023[['player_id', 'xera']],
    on='player_id',
    suffixes=('_avg', '_2023'),
)

# Attach player names. The first merge also brings the mean in under the plain
# name 'xera', so the result carries it twice ('xera' and 'xera_avg').
pitcher_names = filtered_data[['player_id', 'first_name', 'last_name']].drop_duplicates()
pitcher_names_with_average = pd.merge(
    pitcher_names,
    average_xera,
    left_on='player_id',
    right_index=True,
)
pitcher_output = pitcher_names_with_average.merge(
    xera_2023,
    on='player_id',
    suffixes=('_avg', '_2023'),
)

# Display the result
pitcher_output

Unnamed: 0,player_id,first_name,last_name,xera,xera_avg,xera_2023
0,458681,Lance,Lynn,3.6425,3.6425,4.75
1,594835,Marco,Gonzales,4.7625,4.7625,5.18
2,669456,Shane,Bieber,3.9875,3.9875,4.91
3,593958,Eduardo,Rodriguez,3.7275,3.7275,3.49
4,605400,Aaron,Nola,3.5325,3.5325,3.81
...,...,...,...,...,...,...
195,663947,Tyler,Holton,3.3400,3.3400,3.34
196,666974,Yennier,Cano,3.2000,3.2000,3.20
197,687330,Kevin,Kelly,3.3200,3.3200,3.32
198,669461,Matthew,Liberatore,6.5000,6.5000,6.50


In [23]:
# Standardise the multi-season mean xERA and the 2023 xERA across the listed pitchers.
z_scores_xera_avg = zscore(pitcher_output['xera_avg'])
z_score_xera_2023 = zscore(pitcher_output['xera_2023'])

# Add the two z-score columns and their difference (2023 minus average) to the
# existing frame, in this column order.
pitcher_zscore_columns = {
    'z_scores_xera_avg': z_scores_xera_avg,
    'z_score_xera_2023': z_score_xera_2023,
    'zscore_difference': z_score_xera_2023 - z_scores_xera_avg,
}
for pitcher_column_name, pitcher_column_values in pitcher_zscore_columns.items():
    pitcher_output[pitcher_column_name] = pitcher_column_values

# Display the result
pitcher_output

Unnamed: 0,player_id,first_name,last_name,xera,xera_avg,xera_2023,z_scores_xera_avg,z_score_xera_2023,zscore_difference
0,458681,Lance,Lynn,3.6425,3.6425,4.75,-0.806807,0.282848,1.089655
1,594835,Marco,Gonzales,4.7625,4.7625,5.18,0.596862,0.731316,0.134453
2,669456,Shane,Bieber,3.9875,3.9875,4.91,-0.374427,0.449720,0.824146
3,593958,Eduardo,Rodriguez,3.7275,3.7275,3.49,-0.700279,-1.031268,-0.330989
4,605400,Aaron,Nola,3.5325,3.5325,3.81,-0.944668,-0.697524,0.247143
...,...,...,...,...,...,...,...,...,...
195,663947,Tyler,Holton,3.3400,3.3400,3.34,-1.185923,-1.187710,-0.001787
196,666974,Yennier,Cano,3.2000,3.2000,3.20,-1.361382,-1.333723,0.027659
197,687330,Kevin,Kelly,3.3200,3.3200,3.32,-1.210989,-1.208569,0.002420
198,669461,Matthew,Liberatore,6.5000,6.5000,6.50,2.774430,2.108008,-0.666422


In [25]:
# Order pitchers by z-score difference from smallest to largest, so the most
# negative changes (2023 z-score furthest below the average z-score) come first.
sorted_pitcher_output = pitcher_output.sort_values('zscore_difference')

# Display the sorted result
sorted_pitcher_output

Unnamed: 0,player_id,first_name,last_name,xera,xera_avg,xera_2023,z_scores_xera_avg,z_score_xera_2023,zscore_difference
113,669060,Bryse,Wilson,5.116667,5.116667,4.20,1.040731,-0.290774,-1.331505
26,579328,Yusei,Kikuchi,5.122500,5.122500,4.26,1.048042,-0.228197,-1.276239
48,608379,Michael,Wacha,5.010000,5.010000,4.20,0.907048,-0.290774,-1.197823
60,663567,Peter,Lambert,5.515000,5.515000,4.95,1.539953,0.491437,-1.048516
180,666205,Kyle,Muller,8.240000,8.240000,8.24,4.955132,3.922739,-1.032393
...,...,...,...,...,...,...,...,...,...
73,543521,Collin,McHugh,3.612500,3.612500,4.88,-0.844405,0.418431,1.262837
121,656629,Michael,Kopech,4.136667,4.136667,5.64,-0.187480,1.211072,1.398552
138,622663,Luis,Severino,4.845000,4.845000,6.75,0.700258,2.368745,1.668487
100,666201,Alek,Manoah,4.263333,4.263333,6.18,-0.028731,1.774264,1.802996


In [26]:
# Write the sorted pitcher table to the "Resources" folder as CSV, without the row index.
sorted_pitcher_csv = "Resources/sorted_pitcher_output.csv"
sorted_pitcher_output.to_csv(sorted_pitcher_csv, index=False)

In [10]:
from pybaseball import statcast_batter_expected_stats

# Download the batter expected-stats table for 2019, 2021, 2022 and 2023,
# passing 150 as the minimum threshold on every call.
batter_min_pa = 150
batter_data_2019, batter_data_2021, batter_data_2022, batter_data_2023 = (
    statcast_batter_expected_stats(season, batter_min_pa)
    for season in (2019, 2021, 2022, 2023)
)

# Stack the four seasons into one frame with a fresh 0..n-1 index.
batter_season_frames = [
    batter_data_2019,
    batter_data_2021,
    batter_data_2022,
    batter_data_2023,
]
batter_combined_data = pd.concat(batter_season_frames, ignore_index=True)

# Display the combined DataFrame
batter_combined_data

Unnamed: 0,last_name,first_name,player_id,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff
0,Semien,Marcus,543760,2019,747,556,0.285,0.272,0.013,0.522,0.495,0.027,0.373,0.363,0.010
1,Merrifield,Whit,593160,2019,735,559,0.302,0.280,0.022,0.463,0.431,0.032,0.340,0.324,0.016
2,Acuña Jr.,Ronald,660670,2019,715,439,0.280,0.279,0.001,0.518,0.574,-0.056,0.369,0.390,-0.021
3,Villar,Jonathan,542340,2019,714,472,0.274,0.246,0.028,0.453,0.412,0.041,0.335,0.315,0.020
4,Betts,Mookie,605141,2019,706,505,0.295,0.311,-0.016,0.524,0.577,-0.053,0.380,0.411,-0.031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1289,Schmitt,Casey,669477,2023,218,152,0.205,0.229,-0.024,0.290,0.327,-0.037,0.244,0.273,-0.029
1290,Wong,Kolten,543939,2023,216,150,0.165,0.210,-0.045,0.227,0.284,-0.057,0.216,0.260,-0.044
1291,Madrigal,Nick,663611,2023,212,177,0.267,0.284,-0.017,0.346,0.348,-0.002,0.301,0.311,-0.010
1292,Arroyo,Christian,624414,2023,206,153,0.241,0.231,0.010,0.369,0.341,0.028,0.275,0.269,0.006


In [13]:
# Write the combined batter table to the "Resources" folder as CSV, without the row index.
batter_combined_csv = "Resources/batter_combined_data.csv"
batter_combined_data.to_csv(batter_combined_csv, index=False)

In [14]:
# Keep only rows from the seasons of interest (2019, 2021, 2022 and 2023).
relevant_years = [2019, 2021, 2022, 2023]
batter_in_relevant_year = batter_combined_data['year'].isin(relevant_years)
filtered_data = batter_combined_data[batter_in_relevant_year]

# Per-player mean wOBA over every retained season (2023 included).
grouped_data = filtered_data.groupby('player_id')
average_woba = grouped_data['woba'].mean()

# Pair each player's mean wOBA with their 2023 wOBA. The merge is an inner join,
# so only players that have a 2023 row survive. Both inputs call the value 'woba',
# which is why the suffixes produce 'woba_avg' and 'woba_2023'.
batter_rows_2023 = filtered_data[filtered_data['year'] == 2023]
woba_2023 = pd.merge(
    average_woba,
    batter_rows_2023[['player_id', 'woba']],
    on='player_id',
    suffixes=('_avg', '_2023'),
)

# Attach player names. The first merge also brings the mean in under the plain
# name 'woba', so the result carries it twice ('woba' and 'woba_avg').
batter_names = filtered_data[['player_id', 'first_name', 'last_name']].drop_duplicates()
batter_names_with_average = pd.merge(
    batter_names,
    average_woba,
    left_on='player_id',
    right_index=True,
)
players_off_output = batter_names_with_average.merge(
    woba_2023,
    on='player_id',
    suffixes=('_avg', '_2023'),
)

# Display the result
players_off_output


Unnamed: 0,player_id,first_name,last_name,woba,woba_avg,woba_2023
0,543760,Marcus,Semien,0.35300,0.35300,0.354
1,593160,Whit,Merrifield,0.31950,0.31950,0.338
2,660670,Ronald,Acuña Jr.,0.38500,0.38500,0.424
3,605141,Mookie,Betts,0.38100,0.38100,0.406
4,645277,Ozzie,Albies,0.33725,0.33725,0.354
...,...,...,...,...,...,...
285,678225,Ji Hwan,Bae,0.27300,0.27300,0.273
286,665828,Oswaldo,Cabrera,0.25500,0.25500,0.255
287,608671,Travis,Jankowski,0.34200,0.34200,0.342
288,672779,Tucupita,Marcano,0.27500,0.27500,0.275


In [15]:
# Standardise each player's multi-season mean wOBA across every player in average_woba.
z_scores_woba_avg = zscore(average_woba)

# Standardise only the 2023 wOBA values. woba_2023 is a merged frame that also
# carries 'player_id' and 'woba_avg'; passing the whole frame would z-score the
# ID column as well.
# NOTE(review): woba_2023 only holds players with a 2023 row, so this array can be
# shorter than z_scores_woba_avg and the two are not aligned row-for-row.
z_score_woba_2023 = zscore(woba_2023['woba_2023'])


In [17]:
# Standardise the multi-season mean wOBA and the 2023 wOBA across the listed batters,
# writing each result into its own new column of the existing frame.
batter_zscore_sources = {
    'zscore_avg_woba': 'woba_avg',
    'zscore_woba_2023': 'woba_2023',
}
for batter_zscore_column, batter_source_column in batter_zscore_sources.items():
    players_off_output[batter_zscore_column] = zscore(players_off_output[batter_source_column])

# Difference between the two z-scores (2023 minus average).
players_off_output['zscore_difference'] = (
    players_off_output['zscore_woba_2023'] - players_off_output['zscore_avg_woba']
)

# Display the result
players_off_output

Unnamed: 0,player_id,first_name,last_name,woba,woba_avg,woba_2023,zscore_avg_woba,zscore_woba_2023,zscore_difference
0,543760,Marcus,Semien,0.35300,0.35300,0.354,0.952258,0.864183,-0.088075
1,593160,Whit,Merrifield,0.31950,0.31950,0.338,-0.184251,0.430080,0.614331
2,660670,Ronald,Acuña Jr.,0.38500,0.38500,0.424,2.037879,2.763382,0.725504
3,605141,Mookie,Betts,0.38100,0.38100,0.406,1.902176,2.275017,0.372841
4,645277,Ozzie,Albies,0.33725,0.33725,0.354,0.417929,0.864183,0.446254
...,...,...,...,...,...,...,...,...,...
285,678225,Ji Hwan,Bae,0.27300,0.27300,0.273,-1.761794,-1.333463,0.428331
286,665828,Oswaldo,Cabrera,0.25500,0.25500,0.255,-2.372456,-1.821828,0.550627
287,608671,Travis,Jankowski,0.34200,0.34200,0.342,0.579076,0.538606,-0.040470
288,672779,Tucupita,Marcano,0.27500,0.27500,0.275,-1.693943,-1.279200,0.414743


In [18]:
# Order batters by z-score difference from largest to smallest, so the biggest
# gains of the 2023 z-score over the average z-score come first.
sorted_output_batting_woba = players_off_output.sort_values('zscore_difference', ascending=False)

# Display the sorted result
sorted_output_batting_woba

Unnamed: 0,player_id,first_name,last_name,woba,woba_avg,woba_2023,zscore_avg_woba,zscore_woba_2023,zscore_difference
23,641355,Cody,Bellinger,0.332000,0.332000,0.393,0.239819,1.922308,1.682489
115,656811,Ryan,O'Hearn,0.301667,0.301667,0.353,-0.789259,0.837051,1.626310
73,608369,Corey,Seager,0.374500,0.374500,0.438,1.681659,3.143222,1.461563
198,672695,Geraldo,Perdomo,0.296500,0.296500,0.340,-0.964541,0.484343,1.448884
110,600869,Jeimer,Candelario,0.317250,0.317250,0.365,-0.260584,1.162628,1.423212
...,...,...,...,...,...,...,...,...,...
8,547989,José,Abreu,0.334500,0.334500,0.279,0.324633,-1.170674,-1.495308
47,545361,Mike,Trout,0.408000,0.408000,0.370,2.818169,1.298285,-1.519883
53,516782,Starling,Marte,0.337500,0.337500,0.278,0.426410,-1.197806,-1.624216
79,641313,Tim,Anderson,0.321750,0.321750,0.256,-0.107919,-1.794697,-1.686778


In [75]:
# Write the sorted batter table to the "Resources" folder as CSV, without the row index.
sorted_batting_csv = "Resources/sorted_output_batting_woba.csv"
sorted_output_batting_woba.to_csv(sorted_batting_csv, index=False)