In [34]:
from pybaseball import statcast_batter
from pybaseball import playerid_lookup
import pandas as pd
from scipy.stats import zscore
from pybaseball import cache
from pyspark.sql import SparkSession

# Switch on pybaseball's cache (the `cache` object imported above) before any data is requested.
# NOTE(review): presumably this lets repeated runs reuse earlier downloads — confirm in the pybaseball docs.
cache.enable()


In [37]:
from pybaseball import statcast_pitcher_expected_stats

# Download the Statcast expected-stats table for pitchers, one request per season
# (2019, 2021, 2022, 2023), each with a minimum of 150 plate appearances against.
# 2020 is not requested.
pitcher_data_2019, pitcher_data_2021, pitcher_data_2022, pitcher_data_2023 = (
    statcast_pitcher_expected_stats(season, 150) for season in (2019, 2021, 2022, 2023)
)

# Stack the four season tables into one frame with a fresh 0..n-1 row index.
pitcher_season_frames = [pitcher_data_2019, pitcher_data_2021, pitcher_data_2022, pitcher_data_2023]
pitcher_combined_data = pd.concat(pitcher_season_frames, ignore_index=True)

# Show the combined frame itself (not just its column names).
pitcher_combined_data

Unnamed: 0,last_name,first_name,player_id,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff,era,xera,era_minus_xera_diff
0,Bauer,Trevor,545333,2019,911,557,0.23,0.22,0.01,0.43,0.40,0.03,0.32,0.31,0.01,4.48,4.23,0.25
1,Lynn,Lance,458681,2019,875,562,0.24,0.22,0.02,0.39,0.37,0.02,0.29,0.28,0.01,3.67,3.48,0.19
2,Gonzales,Marco,594835,2019,866,657,0.26,0.25,0.01,0.42,0.41,0.01,0.31,0.31,0.00,3.99,4.26,-0.27
3,Minor,Mike,501985,2019,863,588,0.24,0.23,0.02,0.40,0.39,0.01,0.30,0.30,0.00,3.59,3.89,-0.30
4,Bieber,Shane,669456,2019,859,554,0.23,0.24,-0.01,0.39,0.42,-0.02,0.28,0.29,-0.01,3.27,3.80,-0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,Weathers,Ryan,677960,2023,217,161,0.32,0.26,0.06,0.56,0.45,0.11,0.40,0.34,0.06,6.89,4.91,1.98
1116,Gonzales,Marco,594835,2023,215,163,0.28,0.29,-0.00,0.43,0.44,-0.01,0.33,0.35,-0.01,5.22,5.18,0.04
1117,Falter,Bailey,663559,2023,214,170,0.32,0.28,0.03,0.52,0.47,0.05,0.37,0.34,0.03,5.21,4.91,0.30
1118,Liberatore,Matthew,669461,2023,208,155,0.28,0.30,-0.01,0.45,0.51,-0.05,0.35,0.38,-0.03,5.71,6.50,-0.79


In [3]:
# Write the combined pitcher table to Resources/pitcher_combined_data.csv (path is relative
# to the working directory, capital "R"), leaving the row index out of the file.
# NOTE(review): assumes the "Resources" directory already exists — confirm.
pitcher_combined_data.to_csv("Resources/pitcher_combined_data.csv", index=False)

In [4]:
# Baseline seasons (2019, 2021, 2022); 2023 is held out as the comparison season.
relevant_years = [2019, 2021, 2022]
filtered_data = pitcher_combined_data[pitcher_combined_data['year'].isin(relevant_years)]

# Rows for the 2023 season only.
pitching_data_2023 = pitcher_combined_data[pitcher_combined_data['year'] == 2023]

# Mean xERA per player over the baseline seasons (a Series indexed by player_id).
grouped_data = filtered_data.groupby('player_id')
average_xera = grouped_data['xera'].mean()

# Pair each player's baseline xERA with their 2023 xERA. The default inner join keeps only
# players present on both sides; the suffixes produce the columns xera_avg and xera_2023.
xera_2023 = pd.merge(average_xera, pitching_data_2023[['player_id', 'xera']], on='player_id', suffixes=('_avg', '_2023'))

# Attach player names. De-duplicate on player_id alone so that a player whose name is
# spelled differently across seasons still contributes exactly one row (first spelling wins).
pitcher_output = pd.merge(
    filtered_data[['player_id', 'first_name', 'last_name']].drop_duplicates(subset='player_id'),
    xera_2023,
    on='player_id'
)

# Display the result
pitcher_output

Unnamed: 0,player_id,first_name,last_name,xera_avg,xera_2023
0,458681,Lance,Lynn,3.273333,4.75
1,594835,Marco,Gonzales,4.623333,5.18
2,669456,Shane,Bieber,3.680000,4.91
3,593958,Eduardo,Rodriguez,3.806667,3.49
4,605400,Aaron,Nola,3.440000,3.81
...,...,...,...,...,...
144,665795,Edward,Cabrera,4.050000,4.06
145,676664,JP,Sears,4.490000,4.66
146,678394,Brayan,Bello,3.800000,4.20
147,657376,Clarke,Schmidt,3.510000,4.23


In [5]:
# Standardize the baseline and 2023 xERA columns across the players in pitcher_output.
z_scores_xera_avg, z_score_xera_2023 = (
    zscore(pitcher_output[column]) for column in ('xera_avg', 'xera_2023')
)

# Append both z-score columns and their gap (2023 z-score minus baseline z-score).
pitcher_output = pitcher_output.assign(
    z_scores_xera_avg=z_scores_xera_avg,
    z_score_xera_2023=z_score_xera_2023,
    zscore_difference=z_score_xera_2023 - z_scores_xera_avg,
)

# Show the augmented table.
pitcher_output

Unnamed: 0,player_id,first_name,last_name,xera_avg,xera_2023,z_scores_xera_avg,z_score_xera_2023,zscore_difference
0,458681,Lance,Lynn,3.273333,4.75,-1.125002,0.337984,1.462986
1,594835,Marco,Gonzales,4.623333,5.18,0.817354,0.813179,-0.004174
2,669456,Shane,Bieber,3.680000,4.91,-0.539897,0.514801,1.054698
3,593958,Eduardo,Rodriguez,3.806667,3.49,-0.357652,-1.054448,-0.696797
4,605400,Aaron,Nola,3.440000,3.81,-0.885205,-0.700815,0.184390
...,...,...,...,...,...,...,...,...
144,665795,Edward,Cabrera,4.050000,4.06,-0.007548,-0.424538,-0.416990
145,676664,JP,Sears,4.490000,4.66,0.625516,0.238525,-0.386991
146,678394,Brayan,Bello,3.800000,4.20,-0.367243,-0.269824,0.097420
147,657376,Clarke,Schmidt,3.510000,4.23,-0.784490,-0.236670,0.547820


In [6]:
# Order pitchers from the most negative to the most positive z-score change
# (ascending is the sort_values default), so the largest drops in xERA z-score come first.
# NOTE(review): for an ERA-type stat a drop presumably means improvement — confirm intent.
sorted_pitcher_output = pitcher_output.sort_values('zscore_difference')

# Show the ordered table.
sorted_pitcher_output

Unnamed: 0,player_id,first_name,last_name,xera_avg,xera_2023,z_scores_xera_avg,z_score_xera_2023,zscore_difference
113,669060,Bryse,Wilson,5.575000,4.20,2.186595,-0.269824,-2.456418
60,663567,Peter,Lambert,6.080000,4.95,2.913180,0.559005,-2.354174
123,650633,Michael,King,4.760000,3.27,1.013987,-1.297571,-2.311559
52,621381,Matt,Strahm,4.660000,3.20,0.870109,-1.374929,-2.245038
26,579328,Yusei,Kikuchi,5.410000,4.26,1.949196,-0.203517,-2.152713
...,...,...,...,...,...,...,...,...
121,656629,Michael,Kopech,3.385000,5.64,-0.964338,1.321528,2.285866
107,656731,Tylor,Megill,3.840000,6.38,-0.309692,2.139305,2.448998
100,666201,Alek,Manoah,3.305000,6.18,-1.079441,1.918284,2.997725
22,425794,Adam,Wainwright,4.396667,7.79,0.491230,3.697504,3.206274


In [7]:
# Write the sorted pitcher z-score table (not the raw concatenated data) to
# Resources/pitcher_output_for_learning.csv, leaving the row index out of the file.
# NOTE(review): assumes the "Resources" directory already exists — confirm.
sorted_pitcher_output.to_csv("Resources/pitcher_output_for_learning.csv", index=False)

In [8]:
from pybaseball import statcast_batter_expected_stats

# Download the Statcast expected-stats table for batters, one request per season
# (2019, 2021, 2022, 2023), each with 150 as the minimum threshold argument.
# 2020 is not requested.
batter_data_2019, batter_data_2021, batter_data_2022, batter_data_2023 = (
    statcast_batter_expected_stats(season, 150) for season in (2019, 2021, 2022, 2023)
)

# Stack the four season tables into one frame with a fresh 0..n-1 row index.
batter_season_frames = [batter_data_2019, batter_data_2021, batter_data_2022, batter_data_2023]
batter_combined_data = pd.concat(batter_season_frames, ignore_index=True)

# Show the combined frame itself (not just its column names).
batter_combined_data

Unnamed: 0,last_name,first_name,player_id,year,pa,bip,ba,est_ba,est_ba_minus_ba_diff,slg,est_slg,est_slg_minus_slg_diff,woba,est_woba,est_woba_minus_woba_diff
0,Semien,Marcus,543760,2019,747,556,0.285,0.272,0.013,0.522,0.495,0.027,0.373,0.363,0.010
1,Merrifield,Whit,593160,2019,735,559,0.302,0.280,0.022,0.463,0.431,0.032,0.340,0.324,0.016
2,Acuña Jr.,Ronald,660670,2019,715,439,0.280,0.279,0.001,0.518,0.574,-0.056,0.369,0.390,-0.021
3,Villar,Jonathan,542340,2019,714,472,0.274,0.246,0.028,0.453,0.412,0.041,0.335,0.315,0.020
4,Betts,Mookie,605141,2019,706,505,0.295,0.311,-0.016,0.524,0.577,-0.053,0.380,0.411,-0.031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1289,Schmitt,Casey,669477,2023,218,152,0.205,0.229,-0.024,0.290,0.327,-0.037,0.244,0.273,-0.029
1290,Wong,Kolten,543939,2023,216,150,0.165,0.210,-0.045,0.227,0.284,-0.057,0.216,0.260,-0.044
1291,Madrigal,Nick,663611,2023,212,177,0.267,0.284,-0.017,0.346,0.348,-0.002,0.301,0.311,-0.010
1292,Arroyo,Christian,624414,2023,206,153,0.241,0.231,0.010,0.369,0.341,0.028,0.275,0.269,0.006


In [9]:
# Write the combined batter table to Resources/batter_combined_data.csv (path is relative
# to the working directory, capital "R"), leaving the row index out of the file.
# NOTE(review): assumes the "Resources" directory already exists — confirm.
batter_combined_data.to_csv("Resources/batter_combined_data.csv", index=False)

In [10]:
# Baseline seasons (2019, 2021, 2022); 2023 is held out as the comparison season.
relevant_years = [2019, 2021, 2022]
filtered_data = batter_combined_data[batter_combined_data['year'].isin(relevant_years)]

# Rows for the 2023 season only. Taken as an independent copy because a later cell adds a
# column to this frame; without the copy pandas emits SettingWithCopyWarning there.
# Note: this rebinds the name that previously held the raw 2023 download.
batter_data_2023 = batter_combined_data[batter_combined_data['year'] == 2023].copy()

# Mean wOBA per player over the baseline seasons (a Series indexed by player_id).
grouped_data = filtered_data.groupby('player_id')
average_woba = grouped_data['woba'].mean()

# Pair each player's baseline wOBA with their 2023 wOBA. The default inner join keeps only
# players present on both sides; the suffixes produce the columns woba_avg and woba_2023.
woba_2023 = pd.merge(average_woba, batter_data_2023[['player_id', 'woba']], on='player_id', suffixes=('_avg', '_2023'))

# Attach player names. De-duplicate on player_id alone so that a player whose name is
# spelled differently across seasons still contributes exactly one row (first spelling wins).
players_off_output = pd.merge(
    filtered_data[['player_id', 'first_name', 'last_name']].drop_duplicates(subset='player_id'),
    woba_2023,
    on='player_id'
)

# Display the result
players_off_output

Unnamed: 0,player_id,first_name,last_name,woba_avg,woba_2023
0,543760,Marcus,Semien,0.352667,0.354
1,593160,Whit,Merrifield,0.313333,0.338
2,660670,Ronald,Acuña Jr.,0.372000,0.424
3,605141,Mookie,Betts,0.372667,0.406
4,645277,Ozzie,Albies,0.331667,0.354
...,...,...,...,...,...
233,670770,TJ,Friedl,0.323000,0.340
234,641584,Jake,Fraley,0.352000,0.349
235,663743,Nick,Fortes,0.308000,0.258
236,608841,Joey,Meneses,0.395000,0.322


In [11]:
# Join first and last name into one space-separated "full_name" column (appended last),
# then discard the two source columns.
players_off_output = (
    players_off_output
    .assign(full_name=lambda frame: frame['first_name'].str.cat(frame['last_name'], sep=' '))
    .drop(columns=['first_name', 'last_name'])
)

players_off_output.head()

Unnamed: 0,player_id,woba_avg,woba_2023,full_name
0,543760,0.352667,0.354,Marcus Semien
1,593160,0.313333,0.338,Whit Merrifield
2,660670,0.372,0.424,Ronald Acuña Jr.
3,605141,0.372667,0.406,Mookie Betts
4,645277,0.331667,0.354,Ozzie Albies


In [12]:
# Relabel 'full_name' as 'Name' and move it next to the id, ahead of the two wOBA columns.
column_order = ['player_id', 'Name', 'woba_avg', 'woba_2023']
players_off_output = players_off_output.rename(columns={'full_name': 'Name'})[column_order]
players_off_output.head()

Unnamed: 0,player_id,Name,woba_avg,woba_2023
0,543760,Marcus Semien,0.352667,0.354
1,593160,Whit Merrifield,0.313333,0.338
2,660670,Ronald Acuña Jr.,0.372,0.424
3,605141,Mookie Betts,0.372667,0.406
4,645277,Ozzie Albies,0.331667,0.354


In [13]:
# Stand-alone z-scores; no later cell visible in this notebook reads either variable.
# z_scores_woba_avg covers every player in average_woba (anyone with a baseline season),
# which is a wider population than the merged table that is z-scored in the following cell.
z_scores_woba_avg = zscore(average_woba)
# woba_2023 is the merged frame (player_id, woba_avg, woba_2023). Standardize only its 2023
# wOBA column; passing the whole frame would also z-score the player_id and woba_avg columns.
z_score_woba_2023 = zscore(woba_2023['woba_2023'])


In [14]:
# Standardize the baseline and 2023 wOBA columns across the players in the table, then
# record how far each player's 2023 z-score sits above (+) or below (-) the baseline z-score.
players_off_output = players_off_output.assign(
    zscore_woba_avg=lambda frame: zscore(frame['woba_avg']),
    zscore_woba_2023=lambda frame: zscore(frame['woba_2023']),
    zscore_difference=lambda frame: frame['zscore_woba_2023'] - frame['zscore_woba_avg'],
)

# Show the augmented table.
players_off_output

Unnamed: 0,player_id,Name,woba_avg,woba_2023,zscore_woba_avg,zscore_woba_2023,zscore_difference
0,543760,Marcus Semien,0.352667,0.354,0.780199,0.799043,0.018843
1,593160,Whit Merrifield,0.313333,0.338,-0.484402,0.369942,0.854344
2,660670,Ronald Acuña Jr.,0.372000,0.424,1.401783,2.676359,1.274576
3,605141,Mookie Betts,0.372667,0.406,1.423217,2.193621,0.770404
4,645277,Ozzie Albies,0.331667,0.354,0.105031,0.799043,0.694012
...,...,...,...,...,...,...,...
233,670770,TJ Friedl,0.323000,0.340,-0.173610,0.423579,0.597190
234,641584,Jake Fraley,0.352000,0.349,0.758765,0.664949,-0.093817
235,663743,Nick Fortes,0.308000,0.258,-0.655874,-1.775563,-1.119689
236,608841,Joey Meneses,0.395000,0.322,2.141254,-0.059159,-2.200413


In [15]:
# Order batters from the largest positive z-score change down to the largest negative one.
sorted_output_batting_woba = players_off_output.sort_values('zscore_difference', ascending=False)

# Show the ordered table.
sorted_output_batting_woba

Unnamed: 0,player_id,Name,woba_avg,woba_2023,zscore_woba_avg,zscore_woba_2023,zscore_difference
198,672695,Geraldo Perdomo,0.253000,0.340,-2.424172,0.423579,2.847752
115,656811,Ryan O'Hearn,0.276000,0.353,-1.684702,0.772224,2.456926
23,641355,Cody Bellinger,0.311667,0.393,-0.537987,1.844976,2.382963
73,608369,Corey Seager,0.353333,0.438,0.801633,3.051823,2.250189
177,673490,Ha-Seong Kim,0.291500,0.360,-1.186363,0.959956,2.146319
...,...,...,...,...,...,...,...
168,606992,Eric Haase,0.319000,0.233,-0.302214,-2.446033,-2.143819
53,516782,Starling Marte,0.357333,0.278,0.930237,-1.239187,-2.169424
236,608841,Joey Meneses,0.395000,0.322,2.141254,-0.059159,-2.200413
79,641313,Tim Anderson,0.343667,0.256,0.490841,-1.829201,-2.320042


In [16]:
# Write the sorted batter wOBA z-score table (not the raw concatenated data) to
# Resources/sorted_output_batting_woba.csv, leaving the row index out of the file.
# NOTE(review): assumes the "Resources" directory already exists — confirm.
sorted_output_batting_woba.to_csv("Resources/sorted_output_batting_woba.csv", index=False)

In [17]:
from pybaseball import batting_stats

# Fetch season batting tables via batting_stats with qual=150 (the threshold is 150, not 50),
# one request per season. 2023 is fetched too but kept out of the stacked frame below.
full_batting_2019, full_batting_2021, full_batting_2022, full_batting_2023 = (
    batting_stats(season, qual=150) for season in (2019, 2021, 2022, 2023)
)

# Stack only the pre-2023 seasons row-wise with a fresh 0..n-1 index.
pre_2023_frames = [full_batting_2019, full_batting_2021, full_batting_2022]
full_batting_data_pre_changes = pd.concat(pre_2023_frames, ignore_index=True)

full_batting_data_pre_changes.head()

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,10155,2019,Mike Trout,LAA,27,134,470,600,137,63,...,116.6,155,0.438,354,0.193,0.258,0.31,0.678,0.46,8.4
1,17678,2019,Alex Bregman,HOU,25,156,554,690,164,84,...,107.4,178,0.372,479,0.204,0.25,0.263,0.453,0.373,8.4
2,15998,2019,Cody Bellinger,LAD,23,156,558,660,170,86,...,110.6,207,0.456,454,0.149,0.246,0.319,0.635,0.43,7.7
3,11477,2019,Christian Yelich,MIL,27,130,489,580,161,85,...,117.9,182,0.487,374,0.14,0.258,0.314,0.629,0.429,7.8
4,12861,2019,Anthony Rendon,WSN,29,146,545,646,174,93,...,107.7,218,0.466,468,0.172,0.223,0.31,0.59,0.414,7.0


In [18]:
# Report the stacked frame's (rows, columns) and the complete list of its column labels.
batting_frame_shape = full_batting_data_pre_changes.shape
batting_column_labels = list(full_batting_data_pre_changes.columns)
print("Shape of the DataFrame:", batting_frame_shape)
print("Column names:", batting_column_labels)

Shape of the DataFrame: (1239, 320)
Column names: ['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes', 'IFH', 'BU', 'BUH', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Dol', 'Spd', 'wRC+', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI', 'PH', 'WPA/LI', 'Clutch', 'FB% (Pitch)', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'BsR', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL%

In [32]:
# Average every numeric column per player, grouping on the (IDfg, Name) pair.
# numeric_only=True states explicitly that text columns (e.g. 'Team') are left out of the
# mean; relying on the implicit behaviour emits a FutureWarning on older pandas and raises
# a TypeError on pandas >= 2.0.
grouped_full_batting_data = (
    full_batting_data_pre_changes
    .groupby(['IDfg', 'Name'])
    .mean(numeric_only=True)
    .reset_index()
)

# The averaged 'Season' value has no meaning, so remove it.
grouped_full_batting_data = grouped_full_batting_data.drop(columns='Season')

# Global display option: render floats with two decimals from this point on.
pd.set_option('display.float_format', '{:.2f}'.format)

# Display the resulting grouped and averaged DataFrame
grouped_full_batting_data

  grouped_full_batting_data = full_batting_data_pre_changes.groupby(['IDfg', 'Name']).mean().reset_index()


Unnamed: 0,IDfg,Name,Age,G,AB,PA,H,1B,2B,3B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,785,Todd Frazier,33.00,133.00,447.00,499.00,112.00,70.00,19.00,2.00,...,108.40,129.00,0.38,341.00,0.18,0.28,0.23,0.41,0.31,1.90
1,1177,Albert Pujols,40.67,116.33,357.67,397.33,89.33,55.00,13.00,0.00,...,112.30,122.67,0.41,307.00,0.18,0.27,0.26,0.46,0.34,0.37
2,1433,Wilson Ramos,32.00,92.50,312.00,343.50,83.50,60.50,12.00,0.00,...,111.25,112.00,0.43,262.50,0.13,0.25,0.25,0.41,0.31,0.55
3,1744,Miguel Cabrera,37.67,126.00,454.00,502.67,120.33,94.00,15.67,0.00,...,111.67,155.33,0.44,351.33,0.15,0.27,0.25,0.40,0.31,-0.77
4,2136,David Peralta,32.67,127.67,436.00,483.67,113.67,68.67,29.67,4.67,...,113.33,144.00,0.42,341.33,0.16,0.27,0.25,0.37,0.30,1.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630,27465,Spencer Torkelson,22.00,110.00,360.00,404.00,73.00,48.00,16.00,1.00,...,111.50,109.00,0.41,263.00,0.17,0.28,0.23,0.38,0.30,-0.90
631,27506,Ha-seong Kim,25.50,133.50,392.00,440.00,92.00,59.50,20.50,2.50,...,110.85,100.50,0.32,312.50,0.21,0.29,0.23,0.34,0.29,2.30
632,27676,Vinnie Pasquantino,24.00,72.00,258.00,298.00,76.00,56.00,10.00,0.00,...,112.70,106.00,0.47,226.00,0.16,0.23,0.29,0.47,0.37,1.50
633,27684,Michael Massey,24.00,52.00,173.00,194.00,42.00,28.00,9.00,1.00,...,108.80,49.00,0.37,131.00,0.13,0.27,0.24,0.45,0.33,0.80


In [20]:
# Columns carried forward from the sorted wOBA table: id, name, 2023 z-score and z-score change.
selected_columns = ['player_id', 'Name', 'zscore_woba_2023', 'zscore_difference']

# Take an independent copy of the column subset. A later cell assigns into its 'Name'
# column; without the explicit copy pandas emits SettingWithCopyWarning at that point.
filtered_sorted_output_batting_woba = sorted_output_batting_woba[selected_columns].copy()
filtered_sorted_output_batting_woba.head()

Unnamed: 0,player_id,Name,zscore_woba_2023,zscore_difference
198,672695,Geraldo Perdomo,0.4236,2.8478
115,656811,Ryan O'Hearn,0.7722,2.4569
23,641355,Cody Bellinger,1.845,2.383
73,608369,Corey Seager,3.0518,2.2502
177,673490,Ha-Seong Kim,0.96,2.1463


In [21]:
# Lower-case the 'Name' column of all three tables so later name comparisons ignore case.
grouped_full_batting_data['Name'] = grouped_full_batting_data['Name'].str.lower()
# This table was produced by selecting columns from another frame; build a new frame with
# assign() rather than assigning into it, which avoids SettingWithCopyWarning.
filtered_sorted_output_batting_woba = filtered_sorted_output_batting_woba.assign(
    Name=filtered_sorted_output_batting_woba['Name'].str.lower()
)
full_batting_2023['Name'] = full_batting_2023['Name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_sorted_output_batting_woba['Name'] = filtered_sorted_output_batting_woba['Name'].str.lower()


In [22]:
# True for every averaged-player row whose 'Name' also appears in the 2023 batting table.
mask = grouped_full_batting_data['Name'].isin(full_batting_2023['Name'])

# Keep only those rows, as an independent copy: a later cell adds and drops columns on this
# frame, which would otherwise trigger SettingWithCopyWarning.
filtered_grouped_batting_data = grouped_full_batting_data[mask].copy()

filtered_grouped_batting_data.shape

(304, 316)

In [25]:
# Add a lower-cased "first last" key to the 2023 Statcast batter rows. assign() returns a
# new frame, so no column is written into a slice of batter_combined_data.
batter_data_2023 = batter_data_2023.assign(
    full_name=batter_data_2023['first_name']
    .str.cat(batter_data_2023['last_name'], sep=' ')
    .str.lower()
)

# True for every z-score row whose 'Name' appears among the 2023 Statcast full names.
# NOTE(review): relies on 'Name' having been lower-cased by an earlier cell — confirm run order.
mask = filtered_sorted_output_batting_woba['Name'].isin(batter_data_2023['full_name'])

# Keep only the matching rows, as an independent copy so that later column edits on
# wboa_data_df do not act on a slice. (An earlier throw-away assignment to wboa_data_df,
# overwritten here before ever being read, has been removed.)
wboa_data_df = filtered_sorted_output_batting_woba[mask].copy()

# Display the filtered DataFrame
wboa_data_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batter_data_2023['full_name'] = batter_data_2023['first_name'].str.cat(batter_data_2023['last_name'], sep=' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batter_data_2023['full_name'] = batter_data_2023['full_name'].str.lower()


Unnamed: 0,player_id,Name,zscore_woba_2023,zscore_difference
198,672695,geraldo perdomo,0.4236,2.8478
115,656811,ryan o'hearn,0.7722,2.4569
23,641355,cody bellinger,1.8450,2.3830
73,608369,corey seager,3.0518,2.2502
177,673490,ha-seong kim,0.9600,2.1463
...,...,...,...,...
168,606992,eric haase,-2.4460,-2.1438
53,516782,starling marte,-1.2392,-2.1694
236,608841,joey meneses,-0.0592,-2.2004
79,641313,tim anderson,-1.8292,-2.3200


In [26]:
# Build the join key 'Standardized_Name' on both tables: strip every non-word character
# (spaces, hyphens, apostrophes, periods) from 'Name' and lower-case the rest, then drop
# the original 'Name' column.
# regex=True is required: r'\W' is a regular-expression class, and on pandas >= 2.0
# str.replace treats the pattern as a literal string unless told otherwise.
# assign()/drop() return new frames, so nothing is modified in place on a slice.
wboa_data_df = (
    wboa_data_df
    .assign(Standardized_Name=wboa_data_df['Name'].str.replace(r'\W', '', regex=True).str.lower())
    .drop(columns=['Name'])
)
filtered_grouped_batting_data = (
    filtered_grouped_batting_data
    .assign(
        Standardized_Name=filtered_grouped_batting_data['Name']
        .str.replace(r'\W', '', regex=True)
        .str.lower()
    )
    .drop(columns=['Name'])
)

  wboa_data_df['Standardized_Name'] = wboa_data_df['Name'].str.replace(r'\W', '').str.lower()
  filtered_grouped_batting_data['Standardized_Name'] = filtered_grouped_batting_data['Name'].str.replace(r'\W', '').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_grouped_batting_data['Standardized_Name'] = filtered_grouped_batting_data['Name'].str.replace(r'\W', '').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_grouped_batting_data.drop(columns=['Name'], inplace=True)


In [28]:
# Inner-join the averaged batting stats with the z-score table on the standardized name key,
# then promote that key to the row index.
full_batting_df = (
    filtered_grouped_batting_data
    .merge(wboa_data_df, how='inner', on='Standardized_Name')
    .set_index('Standardized_Name')
)

# Show the joined table.
full_batting_df

Unnamed: 0_level_0,IDfg,Age,G,AB,PA,H,1B,2B,3B,HR,...,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR,player_id,zscore_woba_2023,zscore_difference
Standardized_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
miguelcabrera,1744,37.6667,126.0000,454.0000,502.6667,120.3333,94.0000,15.6667,0.0000,10.6667,...,351.3333,0.1527,0.2680,0.2460,0.3957,0.3070,-0.7667,408234,-0.8369,0.0976
davidperalta,2136,32.6667,127.6667,436.0000,483.6667,113.6667,68.6667,29.6667,4.6667,10.6667,...,341.3333,0.1570,0.2663,0.2460,0.3743,0.3033,1.2667,444482,-0.3542,-0.1591
carlossantana,2396,34.6667,149.0000,523.0000,617.0000,123.0000,77.6667,21.0000,0.3333,24.0000,...,426.6667,0.1663,0.2490,0.2543,0.4463,0.3513,1.7667,467793,-0.4614,-0.4272
tommypham,2967,32.6667,148.0000,532.0000,612.3333,131.6667,85.6667,26.6667,1.6667,17.6667,...,396.0000,0.2043,0.2890,0.2537,0.4253,0.3383,1.7333,502054,0.5040,0.6562
anthonyrizzo,3473,30.6667,139.0000,491.0000,579.0000,125.6667,72.0000,24.3333,2.3333,27.0000,...,403.0000,0.1687,0.2570,0.2657,0.4583,0.3613,2.7333,519203,-0.3005,-1.3272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
brysonstott,26294,24.0000,127.0000,427.0000,466.0000,100.0000,69.0000,19.0000,2.0000,10.0000,...,340.0000,0.2040,0.2700,0.2370,0.3400,0.2840,1.8000,681082,0.3163,1.5830
spencertorkelson,27465,22.0000,110.0000,360.0000,404.0000,73.0000,48.0000,16.0000,1.0000,8.0000,...,263.0000,0.1700,0.2750,0.2260,0.3780,0.3050,-0.9000,679529,-0.3273,1.4860
haseongkim,27506,25.5000,133.5000,392.0000,440.0000,92.0000,59.5000,20.5000,2.5000,9.5000,...,312.5000,0.2105,0.2895,0.2295,0.3385,0.2870,2.3000,673490,0.9600,2.1463
vinniepasquantino,27676,24.0000,72.0000,258.0000,298.0000,76.0000,56.0000,10.0000,0.0000,10.0000,...,226.0000,0.1620,0.2280,0.2890,0.4750,0.3740,1.5000,686469,0.0749,-1.1018


In [31]:
# Write the merged batting table to Resources/full_batting_for_learning.csv.
# NOTE(review): index=False omits the row index, and the index of full_batting_df was set to
# 'Standardized_Name' in the merge cell, so the saved file has no player-name column (IDfg and
# player_id remain). Confirm this is intended.
full_batting_df.to_csv('Resources/full_batting_for_learning.csv', index=False)