# Python Analysis Project

In [1022]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

## ***Reading In and Organizing Data:***

In [880]:
# Load the FanGraphs leaderboard exports and the Baseball Savant pitch-arsenal export.
standard_stats = pd.read_csv('./Baseball Data/FanGraphs Leaderboard (Standard).csv')
advanced_stats = pd.read_csv('./Baseball Data/Fangraphs Leaderboard (Advanced).csv')
pitch_arsenal = pd.read_csv('./Baseball Data/Pitch_Arsenal_2017-2021.csv')

# Further FanGraphs exports that are intentionally not loaded at the moment:
# pitch_info_hm = pd.read_csv('./Baseball Data/FanGraphs Leaderboard (Pitch Info H-Movement).csv')
# pitch_info_pd = pd.read_csv('./Baseball Data/FanGraphs Leaderboard (Pitch Info Plate Discipline).csv')
# pitch_info_velo = pd.read_csv('./Baseball Data/FanGraphs Leaderboard (Pitch Info Velo).csv')
# pitch_info_vm = pd.read_csv('./Baseball Data/FanGraphs Leaderboard(Pitch Info Vertical Movement).csv')
# pitch_type = pd.read_csv('./Baseball Data/FanGraphs Leaderboard (Pitch Info Pitch Type).csv')
# batted_ball = pd.read_csv('./Baseball Data/FanGraphs Leaderboard Batted Ball Stats.csv')

In [881]:
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [1043]:
# Never truncate rows when a DataFrame/Series is displayed.
# NOTE(review): with no row limit, displaying a large frame prints every row.
pd.options.display.max_rows = None

In [882]:
#standard_stats

- **Merging Advanced and Standard Stats**

In [884]:
# Remove 'Team' from the standard leaderboard before it is merged with the advanced one
# (presumably so the merge does not emit suffixed Team_x/Team_y columns).
# errors='ignore' keeps this cell re-runnable: the original raised KeyError on a second
# execution because the column had already been dropped.
standard_stats = standard_stats.drop(columns='Team', errors='ignore')

In [891]:
# Inner-join the standard and advanced leaderboards; rows pair up only when
# Name, playerid and ERA all match exactly.
# NOTE(review): ERA is a float join key - rows whose ERA differs between the two
# exports are silently dropped; confirm both files carry identical ERA values.
std_adv = standard_stats.merge(advanced_stats, on=["Name", "playerid", "ERA"])
# std_adv

In [892]:
# Remove the remaining 'Team' column from the merged frame.
# errors='ignore' keeps this cell re-runnable: the original raised KeyError on a second
# execution because the column had already been dropped.
std_adv = std_adv.drop(columns='Team', errors='ignore')

In [893]:
# Reorder the columns so that 'playerid' leads; every other column keeps its relative order.
first_column = std_adv['playerid']
std_adv = std_adv[['playerid', *std_adv.columns.drop('playerid')]]

In [894]:
#std_adv = std_adv.dropna(axis=1, how='all', thresh=None, subset=None)

In [1163]:
#std_adv

**Savant pitch keys**

- ff = four-seam fastball
- sl = slider
- ch = changeup
- cu = curveball
- si = sinker
- fc = cut fastball
- fs = split fastball
- kn = knuckleball

**Fangraphs Pitch Keys**

- FA = four-seam or unclassified fastballs
- FT = two-seam fastball
- FC = cut fastball
- FS = split fastball
- FO = forkball
- SI = sinker
- SL = slider
- CU = curveball
- KC = knuckle curve
- EP = eephus
- CH = changeup
- SC = screwball
- KN = knuckleball
- UN = unknowns

**Statcast Pitch Arsenal Data 2017-2021 from Baseball Savant**

In [854]:
# Re-read the raw Savant export so the reshaping cells below start from an unmodified frame.
pitch_arsenal = pd.read_csv(filepath_or_buffer='./Baseball Data/Pitch_Arsenal_2017-2021.csv')

In [855]:
# Build a "First Last" display name from the first two columns of the raw export
# (column 0 is used as the last name, column 1 as the first name), trimming stray
# whitespace around each part.
# The lookup is positional, so this must run on the freshly read frame, before
# 'Name' is inserted at position 0.
# Vectorised .str ops replace the row-by-row Python loop; a missing name now yields
# NaN instead of crashing on `.strip()`.
last_names = pitch_arsenal.iloc[:, 0].str.strip()
first_names = pitch_arsenal.iloc[:, 1].str.strip()
pitch_arsenal['Name'] = first_names + ' ' + last_names

In [856]:
# Reorder the columns so that 'Name' leads; every other column keeps its relative order.
first_column = pitch_arsenal['Name']
pitch_arsenal = pitch_arsenal[['Name', *pitch_arsenal.columns.drop('Name')]]

In [857]:
# Trim leading/trailing whitespace from every column header.
cols = pitch_arsenal.columns.str.strip().tolist()
pitch_arsenal.columns = cols

In [858]:
# Per-pitcher totals: sum the count-type columns over all of a pitcher's rows,
# order pitchers by total pitch count (largest first), then turn 'Name' back into a column.
# NOTE(review): 'pitch_hand' is summed along with the counts - confirm that is intended.
agg_counts = (
    pitch_arsenal[
        [
            'Name',
            'out_zone',
            'pitch_count_offspeed',
            'pitch_count_fastball',
            'pitch_count_breaking',
            'pitch_count',
            'in_zone',
            'batted_ball',
            'groundballs',
            'flyballs',
            'linedrives',
            'pitch_hand',
            'n',
        ]
    ]
    .groupby(by='Name')
    .sum()
    .sort_values(by='pitch_count', ascending=False)
    .reset_index()
)

In [859]:
# Columns that are aggregated by summation (and therefore excluded from the averages).
sums = [
    'out_zone',
    'pitch_count_offspeed',
    'pitch_count_fastball',
    'pitch_count_breaking',
    'pitch_count',
    'in_zone',
    'batted_ball',
    'groundballs',
    'flyballs',
    'linedrives',
    'pitch_hand',
    'n',
]

In [860]:
# Per-pitcher averages of every column that is not summed (see `sums`).
# numeric_only=True skips the remaining text columns (e.g. the raw first/last name
# columns); without it pandas >= 2.0 raises TypeError, whereas older releases
# dropped those columns silently.
pitch_arsenal_ave = (
    pitch_arsenal
    .drop(columns=sums)
    .groupby(by='Name')
    .mean(numeric_only=True)
)

In [861]:
# Combine the per-pitcher totals and per-pitcher averages into one frame, joined on 'Name'.
agg_pitch_arsenal = agg_counts.merge(pitch_arsenal_ave, on=['Name'])
# agg_pitch_arsenal.rename(columns = {'p_total_pa':'total_batters_faced'}, inplace = True)

## ***Visualizing Data:***

- **Correlations**

- FanGraphs standard and advanced stat correlations

In [897]:
# Pairwise correlation matrix of the FanGraphs stats, leaving out SV, HLD and BS.
# Restricting to numeric columns first keeps the text 'Name' column out of corr();
# pandas >= 2.0 raises on non-numeric columns, older releases dropped them silently.
# select_dtypes is used rather than corr(numeric_only=True) so the cell also runs on
# pandas < 1.5. corr() already returns a DataFrame, so no extra wrapper is needed.
std_adv_corr = (
    std_adv
    .drop(columns=['SV', 'HLD', 'BS'])
    .select_dtypes(include='number')
    .corr()
)

In [1162]:
# Heatmap of the FanGraphs correlation matrix (the figure is built but not displayed).
fig = px.imshow(
    std_adv_corr,
    title='Standard, Advanced, and Batted Ball Statistical Categories Heatmap',
    color_continuous_scale='spectral',
    width=750,
    height=750,
)
# fig.show()

In [1037]:
# For every stat, the 30 other stats with the largest absolute correlation to it.
# The stat's own entry is removed by label rather than by skipping the first sorted
# row: the positional skip dropped the wrong entry whenever another stat tied at
# |r| == 1 or the column was all NaN.
fg_30 = {
    col: (
        std_adv_corr[col]
        .drop(labels=col)
        .abs()
        .sort_values(ascending=False)
        .head(30)
    )
    for col in std_adv_corr.columns
}

- Statcast and FanGraphs stat correlations

In [1218]:
# Inner-join the FanGraphs stats with the aggregated Savant arsenal data on pitcher name.
# NOTE(review): 'Name' is the only key - two different pitchers sharing a name would be
# cross-matched; confirm names are unique in both frames.
fg_sav = std_adv.merge(agg_pitch_arsenal, on='Name')
# fg_sav.sort_values(by = 'pitch_count', ascending = False).head(10)

In [1221]:
# Pairwise correlation matrix over the combined FanGraphs + Savant frame.
# Restricting to numeric columns first keeps text columns such as 'Name' out of corr();
# pandas >= 2.0 raises on non-numeric columns, older releases dropped them silently.
# select_dtypes is used rather than corr(numeric_only=True) so the cell also runs on
# pandas < 1.5.
fg_sav_corr = fg_sav.select_dtypes(include='number').corr()

# The figure previously had an empty title; give it one so it stands alone when skimmed.
fig = px.imshow(
    fg_sav_corr,
    width=1000,
    height=1000,
    color_continuous_scale='spectral',
    title='FanGraphs and Statcast Statistical Categories Heatmap',
)
fig.show()

- Statcast data correlations

In [1031]:
# Pairwise correlation matrix over the aggregated Savant arsenal data.
# Restricting to numeric columns first keeps text columns such as 'Name' out of corr();
# pandas >= 2.0 raises on non-numeric columns, older releases dropped them silently.
# select_dtypes is used rather than corr(numeric_only=True) so the cell also runs on
# pandas < 1.5.
savant_data_corr = agg_pitch_arsenal.select_dtypes(include='number').corr()

In [1038]:
# Heatmap of the Savant correlation matrix (the figure is built but not displayed).
# The figure previously had an empty title; give it one so it stands alone when skimmed.
fig = px.imshow(
    savant_data_corr,
    width=1000,
    height=1000,
    color_continuous_scale='spectral',
    title='Statcast Pitch Arsenal Statistical Categories Heatmap',
)
# fig.show()

- **Baseball Savant Column Features**

In [1046]:
# List every feature present in the combined correlation matrix, with its position.
corr_feature_names = pd.Series(fg_sav_corr.columns)
print(corr_feature_names)

0                   playerid
1                          W
2                          L
3                        ERA
4                          G
5                         GS
6                         CG
7                        ShO
8                         SV
9                        HLD
10                        BS
11                        IP
12                       TBF
13                         H
14                         R
15                        ER
16                        HR
17                        BB
18                       IBB
19                       HBP
20                        WP
21                        BK
22                        SO
23                       K/9
24                      BB/9
25                      K/BB
26                      HR/9
27                        K%
28                       BB%
29                     K-BB%
30                       AVG
31                      WHIP
32                     BABIP
33                      LOB%
34            

In [1222]:
# Absolute correlation of every feature with BABIP, weakest first.
fg_sav_corr['BABIP'].abs().sort_values()

year                     0.000339
xFIP-                    0.002726
swing_percent            0.003184
sl_avg_break_z           0.003510
xFIP                     0.007399
si_avg_speed             0.007801
batted_ball              0.008016
n_breaking_formatted     0.008511
breaking_avg_break_z     0.008901
oz_swing_miss_percent    0.012902
oz_contact_percent       0.014475
ff_range_speed           0.015057
fs_avg_break             0.015754
BB/9                     0.021784
ch_avg_spin              0.022644
si_range_speed           0.022725
offspeed_avg_speed       0.026183
n_fastball_formatted     0.026496
xiso                     0.026807
BK                       0.026923
fc_avg_speed             0.028653
HR/9                     0.034254
WP                       0.034596
IBB                      0.035031
pitch_count_breaking     0.037039
n_offspeed_formatted     0.040226
fc_avg_break_z           0.040601
ch_avg_break_z           0.042712
ff_avg_speed             0.043994
offspeed_avg_s

## **Investigating Batting Average on Balls in Play**

In [1192]:
# BABIP against pop-up rate; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="popups_percent",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. Popup% Starting Pitchers from 2017-2021",
)
fig.show()

In [1193]:
# BABIP against ground-ball rate; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="groundballs_percent",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. Ground Ball % Starting Pitchers from 2017-2021",
)
fig.show()

In [1194]:
# BABIP against fly-ball rate; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="flyballs_percent",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. Flyball% Starting Pitchers from 2017-2021",
)
fig.show()

In [1195]:
# BABIP against average exit velocity; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="exit_velocity_avg",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. avg Exit Velocity Starting Pitchers from 2017-2021",
)
fig.show()

In [1196]:
# BABIP against barrel rate; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="barrel_batted_rate",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. Barrel% Starting Pitchers from 2017-2021",
)
fig.show()

In [1199]:
# BABIP against sweet-spot rate; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="sweet_spot_percent",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. Sweet Spot% Starting Pitchers from 2017-2021",
)
fig.show()

In [1224]:
# BABIP against average launch angle; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="launch_angle_avg",
    y="BABIP",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="BABIP VS. Avg Launch Angle Starting Pitchers from 2017-2021",
)
fig.show()

## **Investigating ERA**

In [1161]:
# The 'E-F' column against BABIP; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="BABIP",
    y="E-F",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="E-F VS. BABIP Starting Pitchers from 2017-2021",
)
fig.show()

In [1141]:
# ERA against BABIP; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="BABIP",
    y="ERA",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="ERA VS. BABIP Starting Pitchers from 2017-2021",
)
fig.show()

In [1137]:
# ERA against strikeout rate; marker size = pitch count, marker colour = innings pitched.
fig = px.scatter(
    fg_sav,
    x="K%",
    y="ERA",
    size='pitch_count',
    color="IP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="ERA VS. K% Starting Pitchers from 2017-2021",
)
fig.show()

In [1148]:
# ERA against walk rate; marker size = pitch count, marker colour = strikeout rate.
fig = px.scatter(
    fg_sav,
    x="BB%",
    y="ERA",
    size='pitch_count',
    color="K%",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="ERA VS. BB% Starting Pitchers from 2017-2021",
)
fig.show()

In [1223]:
# ERA against K-BB%; rows are fed to plotly in ascending K-BB% order.
# Marker size = pitch count, marker colour = FIP.
fig = px.scatter(
    fg_sav.sort_values(by='K-BB%', ascending=True),
    x="K-BB%",
    y="ERA",
    size='pitch_count',
    color="FIP",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="ERA VS. K-BB% Starting Pitchers from 2017-2021",
)
fig.show()

In [1147]:
# Strikeout rate against walk rate; rows are fed to plotly in descending K-BB% order.
# Marker size = pitch count, marker colour = ERA.
fig = px.scatter(
    fg_sav.sort_values(by='K-BB%', ascending=False),
    x="BB%",
    y="K%",
    size='pitch_count',
    color="ERA",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="K% VS. BB% Starting Pitchers from 2017-2021",
)
fig.show()

In [1149]:
# xwoba against ERA; rows are fed to plotly in ascending ERA order.
# Marker size = pitch count, marker colour = the 'E-F' column.
# The title previously read "FIP vs ERA", but FIP is not on either axis - the plot
# shows the 'xwoba' column (y) against ERA (x) - so the title now names those columns.
fig = px.scatter(
    fg_sav.sort_values(by='ERA', ascending=True),
    x="ERA",
    y="xwoba",
    size='pitch_count',
    color="E-F",
    hover_name="Name",
    log_x=False,
    size_max=15,
    width=700,
    height=700,
    title="xwOBA VS. ERA Starting Pitchers from 2017-2021",
)
fig.show()

In [1080]:
# Distribution plot of the 'E-F' column, using 0.2-wide histogram bins.
era_minus_fip = fg_sav['E-F'].to_numpy()
fig = ff.create_distplot(
    [era_minus_fip],
    group_labels=['ERA - FIP'],
    bin_size=.2,
)
fig.update_layout(title_text='ERA - FIP')
fig.show()

In [1122]:
# Distribution plot of BABIP, using 0.0075-wide histogram bins.
babip_values = fg_sav['BABIP'].to_numpy()
fig = ff.create_distplot(
    [babip_values],
    group_labels=['Batting Average on Balls in Play'],
    bin_size=.0075,
)
fig.update_layout(title_text='BABIP')
fig.show()