In [1]:
# Import our dependencies
import numpy as np
import pandas as pd
from unidecode import unidecode

In [2]:
# Read the running-splits CSV from the Resources folder into a DataFrame.
running_df = pd.read_csv("Resources/running_splits.csv")

# Show the column labels; note that the ' first_name' header carries a leading space.
running_df.columns

Index(['last_name', ' first_name', 'player_id', 'name_abbrev', 'team_id',
       'position_name', 'age', 'bat_side', 'seconds_since_hit_000',
       'seconds_since_hit_005', 'seconds_since_hit_010',
       'seconds_since_hit_015', 'seconds_since_hit_020',
       'seconds_since_hit_025', 'seconds_since_hit_030',
       'seconds_since_hit_035', 'seconds_since_hit_040',
       'seconds_since_hit_045', 'seconds_since_hit_050',
       'seconds_since_hit_055', 'seconds_since_hit_060',
       'seconds_since_hit_065', 'seconds_since_hit_070',
       'seconds_since_hit_075', 'seconds_since_hit_080',
       'seconds_since_hit_085', 'seconds_since_hit_090'],
      dtype='object')

In [3]:
# Build a single "Name" column formatted as "<first> <last>", then discard the
# two source columns. The first-name header is ' first_name' (leading space).
running_df = (
    running_df
    .assign(Name=lambda frame: frame[' first_name'] + ' ' + frame['last_name'])
    .drop(columns=[' first_name', 'last_name'])
)

# Preview the first rows of the reshaped DataFrame
running_df.head()

Unnamed: 0,player_id,name_abbrev,team_id,position_name,age,bat_side,seconds_since_hit_000,seconds_since_hit_005,seconds_since_hit_010,seconds_since_hit_015,...,seconds_since_hit_050,seconds_since_hit_055,seconds_since_hit_060,seconds_since_hit_065,seconds_since_hit_070,seconds_since_hit_075,seconds_since_hit_080,seconds_since_hit_085,seconds_since_hit_090,Name
0,682928,WSH,120,SS,22,L,0.0,0.53,0.82,1.07,...,2.46,2.63,2.8,2.97,3.14,3.31,3.48,3.66,3.85,CJ Abrams
1,547989,HOU,117,1B,36,R,0.0,0.58,0.93,1.22,...,2.75,2.95,3.14,3.33,3.51,3.7,3.89,4.09,4.29,José Abreu
2,660670,ATL,144,RF,25,R,0.0,0.55,0.85,1.11,...,2.5,2.67,2.84,3.01,3.18,3.35,3.52,3.69,3.87,Ronald Acuña Jr.
3,642715,MIL,158,SS,27,R,0.0,0.57,0.89,1.15,...,2.6,2.78,2.96,3.14,3.31,3.49,3.66,3.84,4.02,Willy Adames
4,656180,WSH,120,C,27,R,0.0,0.55,0.88,1.15,...,2.66,2.85,3.04,3.23,3.41,3.6,3.79,3.98,4.19,Riley Adams


In [4]:
# Transliterate every value in the 'Name' column to plain ASCII with unidecode
# (e.g. "José Abreu" -> "Jose Abreu"). Series.map applies the function to each
# value in a single pass, replacing the slower row-by-row iterrows()/.at loop.
running_df['Name'] = running_df['Name'].map(unidecode)

In [5]:
# Keep only the name, player ID and batting-side columns. Take an explicit copy
# so this frame is independent of running_df: downstream changes to it cannot be
# mistaken by pandas for writes to a selection (SettingWithCopyWarning).
handedness_df = running_df[["Name", "player_id", "bat_side"]].copy()

In [14]:
# Flag a player as a switch hitter only when the rows sharing a player_id carry
# more than one distinct 'bat_side' value. Checking for a repeated player_id
# alone would also relabel a player who merely appears twice with the same side.
# (Presumably switch hitters are listed once per side in the source data — verify.)
duplicated_mask = (
    handedness_df.groupby('player_id')['bat_side'].transform('nunique') > 1
)

# Set 'bat_side' to 'S' for the flagged rows, then collapse each player's now
# identical rows into one. Building a new frame with assign/mask avoids writing
# through .loc into what may be a selection of another DataFrame.
handedness_df = (
    handedness_df
    .assign(bat_side=handedness_df['bat_side'].mask(duplicated_mask, 'S'))
    .drop_duplicates(subset=['player_id', 'bat_side'])
)
handedness_df.head(10)

Unnamed: 0,Name,player_id,bat_side
0,CJ Abrams,682928,L
1,Jose Abreu,547989,R
2,Ronald Acuna Jr.,660670,R
3,Willy Adames,642715,R
4,Riley Adams,656180,R
5,Nick Ahmed,605113,R
6,Hanser Alberto,593643,R
7,Ozzie Albies,645277,S
9,Jorge Alfaro,595751,R
10,Nick Allen,669397,R


In [7]:
# Save the new DataFrame to the "Resources" folder.
# NOTE(review): this save is commented out, yet a later cell reads
# "Resources/handedness_data.csv" back in. If that file is missing or stale the
# later read will fail or use old data — re-enable this line to regenerate it.
#handedness_df.to_csv("Resources/handedness_data.csv", index=False, encoding='utf-8')

In [8]:
# Read the full batter dataset from the Resources folder.
full_df = pd.read_csv("Resources/dash_full_batter_data.csv")
# Preview the first rows.
full_df.head()

Unnamed: 0,IDfg,Name,Age,G,AB,PA,H,1B,2B,3B,...,BABIP_2023,wRC+_2023,z_scores_avg_woba,z_scores_avg_slg,z_scores_avg_babip,z_scores_avg_wrc+,zscore_difference_woba,zscore_difference_slg,zscore_difference_babip,zscore_difference_wrc+
0,1744,Miguel Cabrera,37.666667,126.0,454.0,502.666667,120.333333,94.0,15.666667,0.0,...,0.305699,83.139097,-0.841291,-1.079977,0.763227,-0.824324,-0.163228,-0.335034,-0.579226,-0.081495
1,2136,David Peralta,32.666667,127.666667,436.0,483.666667,113.666667,68.666667,29.666667,4.666667,...,0.304,88.921411,-0.106384,-0.090203,0.477456,-0.278952,-0.575723,-0.321047,-0.340257,-0.385844
2,2396,Carlos Santana,34.666667,149.0,523.0,617.0,123.0,77.666667,21.0,0.333333,...,0.241071,93.193177,0.053379,-0.34184,-1.64542,0.009775,-0.459862,0.10901,0.049624,-0.496512
3,2967,Tommy Pham,32.666667,148.0,532.0,612.333333,131.666667,85.666667,26.666667,1.666667,...,0.308411,121.157846,-0.06378,-0.487231,0.099829,-0.118548,0.649372,1.222906,0.158851,0.797455
4,3473,Anthony Rizzo,30.666667,139.0,491.0,579.0,125.666667,72.0,24.333333,2.333333,...,0.298113,98.585941,1.107812,0.815692,-1.124907,1.020317,-1.456233,-1.538062,1.09999,-1.282269


In [9]:
# Mapping from the lookup file's column names to the names used in this notebook
columns_to_rename = {"PlayerId": "IDfg", "MLBAMID": "player_id"}

# Load the ID lookup table, keep just the two ID columns, and rename them,
# all in one chained expression (no in-place mutation)
translate_df = (
    pd.read_csv("Resources/fg_to_mlbid.csv")[list(columns_to_rename)]
    .rename(columns=columns_to_rename)
)

# Preview the renamed lookup table
translate_df.head()

Unnamed: 0,IDfg,player_id
0,13611,605141
1,13510,608070
2,10155,545361
3,12916,596019
4,15640,592450


In [10]:
# Read the saved handedness table and attach the matching IDfg to each player_id
# (inner join, so only players present in both tables are kept)
handy_df = pd.read_csv('Resources/handedness_data.csv')
handy_id = pd.merge(handy_df, translate_df, on='player_id')

# Only the batting side and IDfg are carried into the join with the batter data
for_merge_handy = handy_id[['bat_side', 'IDfg']]

In [11]:
# Right join on IDfg: every row of full_df is kept, and bat_side is filled in
# wherever a matching IDfg exists in for_merge_handy (NaN otherwise)
merged_df = pd.merge(for_merge_handy, full_df, how='right', on='IDfg')
merged_df.head()

Unnamed: 0,bat_side,IDfg,Name,Age,G,AB,PA,H,1B,2B,...,BABIP_2023,wRC+_2023,z_scores_avg_woba,z_scores_avg_slg,z_scores_avg_babip,z_scores_avg_wrc+,zscore_difference_woba,zscore_difference_slg,zscore_difference_babip,zscore_difference_wrc+
0,R,1744,Miguel Cabrera,37.666667,126.0,454.0,502.666667,120.333333,94.0,15.666667,...,0.305699,83.139097,-0.841291,-1.079977,0.763227,-0.824324,-0.163228,-0.335034,-0.579226,-0.081495
1,L,2136,David Peralta,32.666667,127.666667,436.0,483.666667,113.666667,68.666667,29.666667,...,0.304,88.921411,-0.106384,-0.090203,0.477456,-0.278952,-0.575723,-0.321047,-0.340257,-0.385844
2,S,2396,Carlos Santana,34.666667,149.0,523.0,617.0,123.0,77.666667,21.0,...,0.241071,93.193177,0.053379,-0.34184,-1.64542,0.009775,-0.459862,0.10901,0.049624,-0.496512
3,R,2967,Tommy Pham,32.666667,148.0,532.0,612.333333,131.666667,85.666667,26.666667,...,0.308411,121.157846,-0.06378,-0.487231,0.099829,-0.118548,0.649372,1.222906,0.158851,0.797455
4,L,3473,Anthony Rizzo,30.666667,139.0,491.0,579.0,125.666667,72.0,24.333333,...,0.298113,98.585941,1.107812,0.815692,-1.124907,1.020317,-1.456233,-1.538062,1.09999,-1.282269


In [12]:
# Write the merged frame to a UTF-8 CSV in the Resources folder, omitting the index.
merged_df.to_csv("Resources/merged_to_check.csv", index=False, encoding='utf-8')

In [13]:
# Read the exported CSV back in to check what was written.
full_again = pd.read_csv("Resources/merged_to_check.csv")
# Inspect the last rows; rows without a handedness match show NaN in bat_side.
full_again.tail()

Unnamed: 0,bat_side,IDfg,Name,Age,G,AB,PA,H,1B,2B,...,BABIP_2023,wRC+_2023,z_scores_avg_woba,z_scores_avg_slg,z_scores_avg_babip,z_scores_avg_wrc+,zscore_difference_woba,zscore_difference_slg,zscore_difference_babip,zscore_difference_wrc+
251,,27465,Spencer Torkelson,22.0,110.0,360.0,404.0,73.0,48.0,16.0,...,0.271605,109.352965,-1.71466,-1.885217,-1.277999,-1.498019,1.791248,2.169469,0.523068,1.684866
252,R,27506,Ha-seong Kim,25.5,133.5,392.0,440.0,92.0,59.5,20.5,...,0.323718,127.5317,-1.091586,-1.071589,-0.956506,-0.944627,1.970996,1.40026,1.636719,1.889214
253,,27676,Vinnie Pasquantino,24.0,72.0,258.0,298.0,76.0,56.0,10.0,...,0.25,104.579538,1.256923,0.312417,0.283539,1.437367,-1.189871,-0.143813,-1.63345,-1.449489
254,,27684,Michael Massey,24.0,52.0,173.0,194.0,42.0,28.0,9.0,...,0.263158,68.033804,-0.756084,-0.928995,0.222302,-0.679961,-0.684074,-0.042706,-1.209857,-0.855488
255,,30116,Seiya Suzuki,27.0,111.0,397.0,446.0,104.0,66.0,22.0,...,0.322222,107.503585,0.266396,0.027228,0.895907,0.426824,-0.097648,0.016758,-0.256885,-0.317064
