# Python Analysis Project

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## ***Reading In and Organizing Data:***

In [353]:
# --- FanGraphs leaderboards: advanced / "standard" / batted-ball exports ---
advanced_stats = pd.read_csv("./Baseball Data/Fangraphs Leaderboard (Advanced).csv")

# NOTE(review): `standard_stats` and `batted_ball` are read from the very same
# CSV path, so both frames start out with identical contents. Confirm whether
# a separate standard-stats export was meant to be loaded here.
standard_stats = pd.read_csv("./Baseball Data/FanGraphs Leaderboard Batted Ball Stats.csv")
batted_ball = pd.read_csv("./Baseball Data/FanGraphs Leaderboard Batted Ball Stats.csv")

# --- FanGraphs "Pitch Info" leaderboards, one CSV per stat family ---
pitch_type = pd.read_csv("./Baseball Data/FanGraphs Leaderboard (Pitch Info Pitch Type).csv")
pitch_info_velo = pd.read_csv("./Baseball Data/FanGraphs Leaderboard (Pitch Info Velo).csv")
pitch_info_hm = pd.read_csv("./Baseball Data/FanGraphs Leaderboard (Pitch Info H-Movement).csv")
pitch_info_vm = pd.read_csv("./Baseball Data/FanGraphs Leaderboard(Pitch Info Vertical Movement).csv")
pitch_info_pd = pd.read_csv("./Baseball Data/FanGraphs Leaderboard (Pitch Info Plate Discipline).csv")

In [354]:
# Lift pandas' column-display cap so wide frames are rendered in full.
pd.options.display.max_columns = None

- **Merging Advanced and Standard Stats**

In [355]:
# Remove Team, BABIP and playerid from the standard table before it is merged
# with the advanced table (presumably to avoid duplicated columns — the merge
# in the next cell joins on Name only).
standard_stats = standard_stats.drop(columns=['Team', 'BABIP', 'playerid'])

In [356]:
# Inner-join the advanced and standard tables on the pitcher's name.
# NOTE(review): Name is the only join key here; two pitchers sharing a name
# would produce cross-matched rows — confirm names are unique in both CSVs.
std_adv = advanced_stats.merge(standard_stats, on="Name")

In [357]:
# The Team column is not used further; remove it from the merged table.
std_adv = std_adv.drop(columns=['Team'])

In [358]:
# Move `playerid` to the front: pop removes the column in place and returns
# it, insert puts it back at position 0.
first_column = std_adv.pop('playerid')
std_adv.insert(loc=0, column='playerid', value=first_column)

In [359]:
#std_adv.head()

In [360]:
#std_adv.loc[std_adv.Name == 'Aaron Nola']

- **Merging Pitch/fx Data**

In [361]:
#print(list(pitch_type.columns))
#print(list(pitch_info_pd.columns))
#print(list(pitch_info_hm.columns))
#print(list(pitch_info_vm.columns))
#print(list(pitch_info_velo.columns))

In [362]:
# Start from the pitch-type table and inner-join each further Pitch Info
# table on playerid. Name, Team and IP are removed from every joined table
# first, so only the copies from `pitch_type` survive.
pitch = pitch_type
for extra in (pitch_info_velo, pitch_info_hm, pitch_info_vm, pitch_info_pd):
    pitch = pitch.merge(extra.drop(columns=['Name', 'Team', 'IP']), on='playerid')

# Put `playerid` in the leading position.
first_column = pitch.pop('playerid')
pitch.insert(loc=0, column='playerid', value=first_column)

In [363]:
# The merge cell above already moved `playerid` to the front of `pitch`;
# repeating the pop/insert here was a no-op, so just verify the invariant.
assert pitch.columns[0] == 'playerid', "playerid should already be the first column of pitch"

In [364]:
# Remove the Team column from the combined pitch table.
pitch = pitch.drop(columns=['Team'])

In [365]:
# Order pitchers from the largest IP value to the smallest.
pitch = pitch.sort_values('IP', ascending=False)

In [366]:
#pitch.head()

- **Batted Ball Data**

In [367]:
# Remove the Team column from the batted-ball table.
batted_ball = batted_ball.drop(columns=['Team'])

In [368]:
# Move `playerid` to the front of the batted-ball table (pop, then re-insert
# at position 0).
first_column = batted_ball.pop('playerid')
batted_ball.insert(loc=0, column='playerid', value=first_column)

In [369]:
#batted_ball.head()

In [370]:
#std_adv.head()

In [371]:
# Drop every batted-ball column that already exists in `std_adv`, ignoring
# std_adv's first two columns so those stay available in `batted_ball`.
# NOTE(review): `batted_ball` and `standard_stats` are read from the same CSV
# in the load cell, so this likely strips most batted-ball columns — confirm
# that is intended.
already_present = set(std_adv.columns[2:])
overlapping = [col for col in batted_ball.columns if col in already_present]
batted_ball = batted_ball.drop(columns=overlapping)

In [372]:
# Inner-join the batted-ball table onto std_adv using both identifier columns.
std_adv_bat = std_adv.merge(batted_ball, on=['playerid', 'Name'])

In [373]:
#std_adv_bat

- **Incorporating Spin Rate Data**

In [374]:
# Load the spin-rate export.
spin = pd.read_csv("./Baseball Data/spin_rate_2017-2021.csv")

In [375]:
# Build "<first> <last>" from the second and first columns of the file, in
# that order (the columns are addressed by position, not by label).
first_names = spin[spin.columns[1]]
last_names = spin[spin.columns[0]]
spin['Name'] = [f'{first} {last}' for first, last in zip(first_names, last_names)]

In [376]:
# Move the freshly built `Name` column to the front of the spin table.
first_column = spin.pop('Name')
spin.insert(loc=0, column='Name', value=first_column)

In [377]:
# Columns removed before averaging. The leading space in ' first_name' is
# part of the label as used here and must be kept.
spin_columns_to_drop = [
    'p_formatted_ip', 'p_strikeout', 'p_k_percent', 'p_bb_percent',
    'player_id', 'year', ' first_name', 'last_name', 'Unnamed: 17',
]
spin = spin.drop(columns=spin_columns_to_drop)

In [378]:
# Collapse rows that share a Name into one row holding the mean of each
# remaining column (presumably one input row per pitcher-season — confirm).
avg_spin_rates = spin.groupby('Name').mean()

In [379]:
# Turn the group key (Name) back into an ordinary column.
avg_spin_rates = avg_spin_rates.reset_index(drop=False)

In [380]:
# Trim leading/trailing whitespace from every name so it can be matched
# against the Name column of the other tables.
avg_spin_rates['Name'] = avg_spin_rates['Name'].str.strip()

**Savant pitch keys**

- ff = 4 seam
- sl = slider
- ch = changeup
- cu = curveball
- si = sinker
- fc = cut fastball
- fs = split fastball
- kn = knuckleball

**Fangraphs Pitch Keys**

- FA = fourseam or unclassified fastballs
- FT = two seam fastball
- FC = cut fastball
- FS = split fastball
- FO = forkball
- SI = sinker
- SL = slider
- CU = curveball
- KC = knuckle curve
- EP = eephus
- CH = changeup
- SC = screwball
- KN = knuckleball
- UN = unknowns

In [381]:
# Sort the pitch table by Name, then inner-join the averaged spin rates on Name.
pitch_plus = pitch.sort_values('Name').merge(avg_spin_rates, on='Name')

In [382]:
# Names present in the stats table but absent from the pitch/spin table.
unmatched_names = std_adv_bat.loc[~std_adv_bat['Name'].isin(pitch_plus['Name']), 'Name']

# Only the first unmatched name is kept, because the filter a few cells below
# compares Name against a single value. Fall back to None instead of raising
# IndexError when every pitcher has a match.
missing_player = unmatched_names.iloc[0] if not unmatched_names.empty else None

In [383]:
# Display the name captured in `missing_player`.
missing_player

'Hyun-Jin Ryu'

In [384]:
# Keep only the rows whose Name differs from the unmatched pitcher.
keep_mask = std_adv_bat['Name'] != missing_player
std_adv_bat = std_adv_bat[keep_mask]

In [385]:
# List the column labels of the combined stats table.
std_adv_bat.columns

Index(['playerid', 'Name', 'K/9', 'BB/9', 'K/BB', 'HR/9', 'K%', 'BB%', 'K-BB%',
       'AVG', 'WHIP', 'BABIP', 'LOB%', 'ERA-', 'FIP-', 'xFIP-', 'ERA', 'FIP',
       'E-F', 'xFIP', 'SIERA', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB',
       'RS', 'RS/9', 'Balls', 'Strikes', 'Pitches', 'Pull%', 'Cent%', 'Oppo%',
       'Soft%', 'Med%', 'Hard%'],
      dtype='object')

In [386]:
# List the column labels of the combined pitch + spin table.
pitch_plus.columns

Index(['playerid', 'Name', 'IP', 'FA%', 'FC%', 'FS%', 'SI%', 'CH%', 'SL%',
       'CU%', 'CS%', 'KN%', 'SB%', 'XX%', 'vFA', 'vFC', 'vFS', 'vSI', 'vCH',
       'vSL', 'vCU', 'vCS', 'vKN', 'vSB', 'FA-X', 'FC-X', 'FS-X', 'SI-X',
       'CH-X', 'SL-X', 'CU-X', 'CS-X', 'KN-X', 'SB-X', 'FA-Z', 'FC-Z', 'FS-Z',
       'SI-Z', 'CH-Z', 'SL-Z', 'CU-Z', 'CS-Z', 'KN-Z', 'SB-Z', 'O-Swing%',
       'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%',
       'Pace', 'ff_avg_spin', 'sl_avg_spin', 'ch_avg_spin', 'cu_avg_spin',
       'si_avg_spin', 'fc_avg_spin', 'fs_avg_spin', 'kn_avg_spin',
       'fastball_avg_spin'],
      dtype='object')

In [389]:
# Inner-join the stats table with the pitch/spin table on both identifier
# columns, then order the rows by playerid.
join_keys = ['playerid', 'Name']
master_stats = std_adv_bat.merge(pitch_plus, on=join_keys).sort_values('playerid')

In [390]:
# Drop columns that hold no data at all (every value missing).
# `thresh` is intentionally not passed: recent pandas versions raise
# TypeError when `how` and `thresh` are both supplied, even as thresh=None.
master_stats = master_stats.dropna(axis=1, how='all')

In [391]:
# Display the merged master table.
master_stats

Unnamed: 0,playerid,Name,K/9,BB/9,K/BB,HR/9,K%,BB%,K-BB%,AVG,WHIP,BABIP,LOB%,ERA-,FIP-,xFIP-,ERA,FIP,E-F,xFIP,SIERA,GB/FB,LD%,GB%,FB%,IFFB%,HR/FB,RS,RS/9,Balls,Strikes,Pitches,Pull%,Cent%,Oppo%,Soft%,Med%,Hard%,IP,FA%,FC%,FS%,SI%,CH%,SL%,CU%,CS%,KN%,XX%,vFA,vFC,vFS,vSI,vCH,vSL,vCU,vCS,vKN,FA-X,FC-X,FS-X,SI-X,CH-X,SL-X,CU-X,CS-X,KN-X,FA-Z,FC-Z,FS-Z,SI-Z,CH-Z,SL-Z,CU-Z,CS-Z,KN-Z,O-Swing%,Z-Swing%,Swing%,O-Contact%,Z-Contact%,Contact%,Zone%,ff_avg_spin,sl_avg_spin,ch_avg_spin,cu_avg_spin,si_avg_spin,fc_avg_spin,fs_avg_spin,fastball_avg_spin
106,404,CC Sabathia,8.05,3.09,2.61,1.48,0.208,0.080,0.128,0.252,1.33,0.287,0.772,91,104,99,4.01,4.69,-0.67,4.36,4.44,1.33,0.209,0.452,0.339,0.107,0.163,261,5.76,2326,4237,6563,0.439,0.351,0.210,0.231,0.464,0.305,408.0,0.007,0.385,,0.171,0.125,0.307,,,,,90.9,89.2,,90.4,83.6,79.8,,,,2.6,0.1,,8.4,6.1,-6.5,,,,7.1,5.3,,4.3,3.5,-0.7,,,,0.288,0.654,0.470,0.615,0.857,0.782,0.495,2091.000000,2231.666667,1868.666667,,2027.333333,2190.000000,,2139.333333
126,1118,Marco Estrada,7.39,3.29,2.25,1.71,0.188,0.084,0.105,0.259,1.40,0.284,0.704,124,117,129,5.37,5.14,0.23,5.59,5.13,0.50,0.195,0.268,0.537,0.171,0.113,199,5.07,2183,3971,6154,0.404,0.323,0.273,0.210,0.498,0.293,353.1,0.510,0.066,,0.000,0.335,0.000,0.074,,,,89.3,85.8,,76.7,77.3,90.8,77.1,,,-2.3,2.6,,-5.1,-5.5,-2.2,3.5,,,11.6,4.6,,10.3,9.0,12.2,-7.7,,,0.262,0.697,0.485,0.711,0.813,0.786,0.512,2344.000000,,2073.000000,2641.000000,,2489.500000,,2360.500000
20,1943,Zack Greinke,8.45,1.69,5.01,1.08,0.236,0.047,0.189,0.232,1.06,0.279,0.765,76,82,84,3.26,3.47,-0.21,3.61,3.79,1.33,0.219,0.446,0.335,0.086,0.128,474,5.29,4391,7955,12346,0.403,0.360,0.237,0.175,0.469,0.356,806.1,0.417,0.003,0.003,0.054,0.201,0.170,0.136,0.012,,0.00,89.7,85.7,76.9,90.3,86.8,83.7,71.7,66.6,,-0.9,3.7,0.3,-7.0,-7.0,4.1,6.8,6.8,,8.6,1.7,7.7,6.1,1.4,0.2,-8.2,-8.6,,0.305,0.619,0.454,0.598,0.847,0.759,0.475,2322.800000,2487.800000,1720.200000,2458.600000,2258.000000,2409.000000,1703.000000,2315.800000
114,1994,Ivan Nova,5.98,2.06,2.90,1.45,0.156,0.054,0.102,0.280,1.35,0.303,0.723,105,109,104,4.50,4.73,-0.23,4.51,4.74,1.46,0.222,0.462,0.316,0.091,0.156,286,4.65,2999,5496,8495,0.428,0.356,0.217,0.167,0.498,0.335,554.0,0.243,0.000,,0.372,0.127,0.046,0.197,,,,93.1,86.2,,92.5,86.5,86.8,80.4,,,-7.2,-10.0,,-9.7,-9.3,-2.0,1.7,,,6.2,3.6,,3.2,3.0,3.5,-4.9,,,0.286,0.681,0.489,0.636,0.900,0.825,0.513,2264.000000,,1994.333333,2286.333333,2162.333333,2142.000000,,2200.333333
15,2036,Clayton Kershaw,9.74,1.69,5.78,1.18,0.274,0.047,0.227,0.215,0.99,0.266,0.816,68,78,73,2.76,3.32,-0.56,3.13,3.35,1.50,0.194,0.484,0.322,0.111,0.157,445,5.90,3213,6781,9994,0.475,0.324,0.201,0.203,0.447,0.350,678.1,0.413,,,0.008,0.007,0.393,0.160,,,,91.4,,,92.1,85.8,87.8,73.6,,,-0.4,,,4.8,5.7,-2.6,-1.8,,,10.1,,,9.8,8.8,4.1,-11.2,,,0.315,0.691,0.507,0.506,0.838,0.737,0.512,2429.250000,2581.000000,2182.000000,2456.000000,2333.000000,,,2429.250000
65,2233,Adam Wainwright,7.89,3.01,2.62,1.13,0.205,0.078,0.127,0.255,1.33,0.301,0.746,100,99,98,4.16,4.21,-0.05,4.22,4.47,1.58,0.232,0.470,0.298,0.089,0.142,295,5.13,3111,5382,8493,0.423,0.344,0.232,0.168,0.490,0.342,517.2,0.133,0.225,,0.258,0.032,,0.333,0.000,,0.00,89.9,85.0,,89.9,82.9,,74.0,64.2,,-1.9,3.4,,-7.4,-8.3,,10.1,9.5,,7.0,2.3,,5.1,3.0,,-9.6,-8.2,,0.244,0.594,0.427,0.591,0.891,0.809,0.521,2208.750000,,1748.750000,2756.000000,2192.500000,2367.500000,,2260.500000
5,2429,Corey Kluber,10.28,1.93,5.33,0.97,0.289,0.054,0.235,0.215,1.01,0.280,0.776,64,70,72,2.85,3.01,-0.16,3.11,3.23,1.30,0.216,0.443,0.341,0.102,0.125,310,5.48,2539,5064,7603,0.394,0.336,0.270,0.192,0.478,0.330,508.2,0.110,0.266,,0.289,0.073,0.245,0.000,,,0.00,92.4,88.7,,92.2,85.4,83.9,83.6,,,-3.4,2.1,,-8.4,-7.1,8.9,8.9,,,7.4,2.3,,3.9,1.4,-0.2,-0.3,,,0.323,0.629,0.482,0.448,0.846,0.718,0.520,2391.000000,,1729.000000,2598.000000,2246.500000,2592.000000,,2402.500000
43,2520,Lance Lynn,9.29,3.34,2.78,1.05,0.245,0.088,0.157,0.230,1.24,0.285,0.761,80,89,96,3.60,3.88,-0.28,4.19,4.23,1.17,0.212,0.424,0.363,0.113,0.118,405,5.02,4729,7797,12526,0.375,0.343,0.282,0.185,0.484,0.332,725.2,0.453,0.162,,0.272,0.025,0.004,0.071,,,,93.9,88.5,,92.2,86.1,86.0,80.9,,,-3.7,1.6,,-7.4,-7.1,2.9,4.2,,,7.5,2.2,,3.1,2.8,-0.3,-6.3,,,0.250,0.688,0.472,0.608,0.828,0.771,0.508,2428.000000,,1803.600000,2352.400000,2268.200000,2531.600000,,2406.800000
96,2608,Jhoulys Chacin,7.75,3.55,2.19,1.18,0.202,0.093,0.110,0.235,1.29,0.272,0.729,99,107,108,4.21,4.53,-0.32,4.60,4.68,1.27,0.210,0.441,0.348,0.115,0.131,222,4.17,2969,4957,7926,0.423,0.344,0.232,0.186,0.457,0.357,479.1,0.136,0.005,0.031,0.353,0.030,0.414,0.023,,,,90.8,88.0,84.7,90.7,83.5,79.8,76.4,,,-4.5,0.7,-4.7,-8.9,-5.5,8.0,8.2,,,7.1,3.2,1.5,3.6,3.0,0.4,-4.2,,,0.229,0.606,0.425,0.606,0.876,0.806,0.519,2167.000000,2400.333333,1437.333333,2482.333333,2075.333333,,1598.000000,2101.666667
76,2717,Rick Porcello,8.14,2.24,3.64,1.45,0.209,0.057,0.151,0.270,1.34,0.313,0.677,106,100,101,4.87,4.35,0.52,4.45,4.30,1.04,0.212,0.401,0.387,0.106,0.134,421,6.03,3467,6955,10422,0.418,0.354,0.228,0.193,0.459,0.349,628.0,0.256,,0.000,0.293,0.108,0.204,0.132,,,,91.7,,92.5,89.9,81.2,85.6,74.8,,,-4.8,,-4.6,-9.0,-8.3,1.9,8.6,,,8.1,,7.5,4.5,2.7,2.0,-7.2,,,0.272,0.662,0.484,0.690,0.871,0.825,0.543,2401.750000,2491.500000,1665.750000,2770.000000,2277.250000,,,2330.000000


## ***Visualizing Data:***

- **Correlations**

In [392]:
# Pairwise correlations between the numeric columns only. `master_stats`
# also carries the text column `Name`, which recent pandas versions refuse to
# coerce inside `.corr()`, so non-numeric columns are excluded explicitly.
# `.corr()` already returns a DataFrame, so no extra wrapping is needed.
master_stats_corr = master_stats.select_dtypes(include='number').corr()

In [393]:
# Render the correlation matrix as a square 950x950 heatmap.
fig = px.imshow(
    master_stats_corr,
    title='Batted Ball,Advanced, and Standard Stats Heatmap',
    color_continuous_scale='spectral',
    width=950,
    height=950,
)
fig.show()

## ***Visualizing Pitch Data:***

- **Correlations**

In [394]:
# Four-seam velocity (vFA) against four-seam average spin (ff_avg_spin), one
# marker per pitcher: marker size follows IP and colour follows Hard%.
# The frame is `master_stats`; no variable named `master` is defined in this
# notebook, so the original call failed with NameError.
fig = px.scatter(
    master_stats,
    x="vFA",
    y="ff_avg_spin",
    size="IP",
    color="Hard%",
    hover_name="Name",
    log_x=False,
    size_max=20,
    width=900,
    height=900,
    title="Four-Seam Fastball Velocity (vFA) vs. Average Spin Rate (ff_avg_spin)",
)

fig.show()