# Pitcher Profile

### This app analyzes MLB pitcher arsenals based on metrics such as pitch type, pitch usage, whiff percent, and strikeout percent during the 2023 MLB regular season.

In [67]:
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
import numpy as np
from pybaseball import statcast_pitcher_arsenal_stats

### Get pitcher data

In [68]:

# Season and on-disk location of the arsenal data, named once so they are easy to change.
SEASON = 2023
PITCH_DATA_CSV = 'pitch_data.csv'

# Download the pitch-arsenal stats for the season (minPA=0 is passed through to
# pybaseball; presumably "no minimum plate appearances" — confirm against its docs).
pitch_data = statcast_pitcher_arsenal_stats(SEASON, minPA=0)

# Round-trip through CSV. `to_csv` returns None when given a path, so its result is
# not kept. The frame's index is written as an extra column, which `read_csv` loads
# back as 'Unnamed: 0'; a later cell drops that column by name, so the default
# index=True is deliberately left in place.
pitch_data.to_csv(PITCH_DATA_CSV, sep=',')
pitch_2023 = pd.read_csv(PITCH_DATA_CSV)

In [69]:

def load_data() -> pd.DataFrame:
    """Return the pitch-arsenal frame loaded from CSV.

    The module-level ``pitch_2023`` object itself is handed back (not a copy).
    """
    return pitch_2023


df = load_data()

### View the data

In [70]:
# Peek at 10 randomly sampled rows; no random_state is set, so the rows differ on each run.
df.sample(10)

Unnamed: 0.1,Unnamed: 0,"last_name, first_name",player_id,team_name_alt,pitch_type,pitch_name,run_value_per_100,run_value,pitches,pitch_usage,...,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent
1662,1662,"Adcock, Ty",686654,SEA,SL,Slider,-1.3,-2,116,56.0,...,0.226,0.613,0.403,23.3,9.7,12.5,0.369,0.733,0.459,57.1
2291,2291,"Rodríguez, Dereck",605446,ATL,FF,4-Seam Fastball,2.8,1,35,36.1,...,0.333,0.333,0.369,0.0,0.0,0.0,0.289,0.37,0.366,30.0
945,945,"Ruiz, José",614179,AZ,FF,4-Seam Fastball,-2.0,-5,257,31.3,...,0.236,0.491,0.399,33.9,23.2,20.3,0.22,0.459,0.367,43.6
2776,2776,"Kriske, Brooks",621139,KC,SL,Slider,4.5,1,20,21.7,...,0.2,0.2,0.18,12.5,20.0,20.0,0.298,0.409,0.307,25.0
759,759,"Nelson, Ryne",669194,AZ,SL,Slider,-0.3,-1,316,13.4,...,0.177,0.38,0.255,26.6,19.3,12.5,0.184,0.3,0.242,26.6
2058,2058,"Englert, Mason",669438,DET,CU,Curveball,-1.8,-1,64,6.7,...,0.316,0.421,0.368,20.0,15.8,16.7,0.324,0.42,0.323,37.5
1772,1772,"Flexen, Chris",623167,COL,SL,Slider,-2.3,-3,117,6.7,...,0.296,0.407,0.321,27.8,35.7,15.6,0.226,0.343,0.26,35.3
1998,1998,"Leiter Jr., Mark",643410,CHC,FF,4-Seam Fastball,-3.6,-3,86,8.1,...,0.389,0.944,0.568,21.2,20.0,14.3,0.358,0.795,0.503,42.9
3080,3080,"Lucas, Easton",687922,OAK,CH,Changeup,2.9,0,11,7.4,...,0.0,0.0,0.0,66.7,50.0,25.0,0.112,0.124,0.101,0.0
2734,2734,"Herget, Jimmy",623474,LAA,CH,Changeup,-12.9,-4,30,5.9,...,0.333,1.0,0.542,28.6,33.3,16.7,0.315,0.624,0.394,75.0


In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,player_id,run_value_per_100,run_value,pitches,pitch_usage,pa,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent
count,3291.0,3291.0,3291.0,3291.0,3291.0,3291.0,3291.0,3273.0,3273.0,3290.0,3280.0,3291.0,3216.0,3228.0,3228.0,3290.0,3229.0
mean,1645.0,630851.068976,-0.905287,0.02917,217.367669,24.348982,55.566393,0.266683,0.451433,0.350826,25.05561,21.332574,17.587469,0.260729,0.435831,0.33166,38.455218
std,950.174195,55239.784353,6.04105,4.271102,252.598304,17.066225,65.879069,0.17261,0.374549,0.20107,14.121354,16.466813,11.633709,0.113406,0.264262,0.145369,20.177301
min,0.0,425794.0,-116.9,-23.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,822.5,605463.0,-1.9,-2.0,40.0,10.65,10.0,0.184,0.279,0.25625,16.4,10.8,11.6,0.199,0.304,0.258,27.4
50%,1645.0,656457.0,-0.2,0.0,130.0,21.5,32.0,0.25,0.4,0.333,24.4,20.0,17.5,0.252,0.403,0.321,38.1
75%,2467.5,669330.0,1.3,1.0,305.5,35.4,78.0,0.323,0.538,0.411,33.3,29.55,22.8,0.304,0.50925,0.385,48.4
max,3290.0,701643.0,48.7,29.0,1826.0,89.8,455.0,1.0,4.0,2.0,100.0,100.0,100.0,1.0,4.0,1.912,100.0


In [72]:
# Column dtypes and non-null counts; columns with fewer non-null rows have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3291 entries, 0 to 3290
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             3291 non-null   int64  
 1   last_name, first_name  3291 non-null   object 
 2   player_id              3291 non-null   int64  
 3   team_name_alt          3291 non-null   object 
 4   pitch_type             3291 non-null   object 
 5   pitch_name             3291 non-null   object 
 6   run_value_per_100      3291 non-null   float64
 7   run_value              3291 non-null   int64  
 8   pitches                3291 non-null   int64  
 9   pitch_usage            3291 non-null   float64
 10  pa                     3291 non-null   int64  
 11  ba                     3273 non-null   float64
 12  slg                    3273 non-null   float64
 13  woba                   3290 non-null   float64
 14  whiff_percent          3280 non-null   float64
 15  k_pe

### Check for duplicates

In [73]:
# 'Unnamed: 0' is the row number written by the CSV round trip, so it is unique on
# every row; leaving it in the comparison would make it impossible for any row to be
# flagged as a duplicate. Compare on the real data columns only.
data_columns = [col for col in df.columns if col != 'Unnamed: 0']
duplicate_rows = df[df.duplicated(subset=data_columns)]
duplicate_rows

Unnamed: 0.1,Unnamed: 0,"last_name, first_name",player_id,team_name_alt,pitch_type,pitch_name,run_value_per_100,run_value,pitches,pitch_usage,...,ba,slg,woba,whiff_percent,k_percent,put_away,est_ba,est_slg,est_woba,hard_hit_percent


### Identify and replace missing values with zero

In [74]:
# Number of missing (NaN) values in each column.
df.isna().sum()

Unnamed: 0                0
last_name, first_name     0
player_id                 0
team_name_alt             0
pitch_type                0
pitch_name                0
run_value_per_100         0
run_value                 0
pitches                   0
pitch_usage               0
pa                        0
ba                       18
slg                      18
woba                      1
whiff_percent            11
k_percent                 0
put_away                 75
est_ba                   63
est_slg                  63
est_woba                  1
hard_hit_percent         62
dtype: int64

In [75]:
# Replace every NaN with 0 in a new frame; `df` itself is left untouched.
# NOTE(review): a missing rate stat (e.g. ba, whiff_percent) becomes a literal 0 here,
# and those zeros feed the distributions plotted later — confirm that is intended.
full_df = df.fillna(0)

### Recheck for missing values

In [76]:
# Sanity check: every column should now report 0 missing values.
full_df.isna().sum()

Unnamed: 0               0
last_name, first_name    0
player_id                0
team_name_alt            0
pitch_type               0
pitch_name               0
run_value_per_100        0
run_value                0
pitches                  0
pitch_usage              0
pa                       0
ba                       0
slg                      0
woba                     0
whiff_percent            0
k_percent                0
put_away                 0
est_ba                   0
est_slg                  0
est_woba                 0
hard_hit_percent         0
dtype: int64

In [87]:
# Pitch-type codes to tally, in the order their counts are printed.
pitch_list = ['FC', 'FF', 'CH', 'SI', 'CU', 'SL', 'FS', 'SV', 'ST', 'KN', 'SC']

# Tally the column once instead of once per code, and reindex so a code that is
# absent from the data reports 0 rather than raising KeyError.
type_counts = full_df['pitch_type'].value_counts().reindex(pitch_list, fill_value=0)
for pitch_counts in type_counts:
    print(pitch_counts)

278
718
554
488
363
589
81
9
207
3
1


#### 4-seam fastballs are the fundamental weapon, appearing in 718 pitchers' arsenals, followed by sliders (589), changeups (554), sinkers (488), curveballs (363), cutters (278), sweepers (207), splitters (81), slurves (9), and knuckleballs (3), with only one pitcher utilizing the screwball.

### Scatterplot comparing put-away percent, hard-hit percent, and whiff percent to strikeout percent

In [77]:

# Wide-form scatter: each of the three y columns becomes its own trace, plotted
# against k_percent. A title and axis/legend labels are supplied so the figure is
# readable on its own (the wide-form defaults are the generic 'value' / 'variable').
k_scatter = px.scatter(
    full_df,
    x='k_percent',
    y=['put_away', 'hard_hit_percent', 'whiff_percent'],
    title='put_away, hard_hit_percent and whiff_percent vs. k_percent',
    labels={'k_percent': 'k_percent (%)', 'value': 'Percent', 'variable': 'Metric'},
)
k_scatter

### Distributions of the metrics we'll be looking at, across all pitches

In [82]:
# Metrics whose spread is compared side by side, one box per column.
stats_df = full_df.filter(items=['pitch_usage', 'whiff_percent', 'k_percent', 'put_away', 'hard_hit_percent'])

# Build every box trace up front and hand them to the figure in one go.
box_traces = [
    go.Box(y=metric_values.values, name=metric_name)
    for metric_name, metric_values in stats_df.items()
]
fig = go.Figure(data=box_traces)
fig.show()
    

## Takeaways:
#### 1. Most pitch usage was between 2.5 - 17.4%, which means that the majority of MLB pitchers use most pitch types sparingly while leaning on others more heavily.
#### 2. The likelihood that a pitch is whiffed on typically falls between 15 - 34.9%.
#### 3. The strikeout percent across pitches has a positive skew, with most pitch types ending the plate appearance in a strikeout less than 40% of the time.
#### 4. For the majority of pitch types, the put-away percent falls between 15 - 24.9%.
#### 5. The median hard-hit percent across all pitches is 37.6%.



### Check sidebar filtering

In [58]:

# Seeded generator so the "random" team/pitcher pick is the same on every
# Restart & Run All.
rng = np.random.default_rng(2023)

team = df['team_name_alt'].sort_values().unique()
team_choice = rng.choice(team)

# Filter to the chosen team first, then de-duplicate. De-duplicating names across the
# whole league first keeps only one row per name, so a pitcher who shares a name with
# someone on another team could be dropped from his own team's list.
on_team = df['team_name_alt'] == team_choice
pitcher_choice = df.loc[on_team, 'last_name, first_name'].drop_duplicates()
pitcher = rng.choice(pitcher_choice.to_numpy())
pitcher


'Turnbull, Spencer'

 ### Define selected pitcher data to use

In [59]:
# Metric columns charted for the selected pitcher.
columns = ['pitches', 'pitch_usage', 'whiff_percent', 'k_percent', 'put_away', 'hard_hit_percent']

# Mask and rows both come from full_df, so the selection does not rely on `df` and
# `full_df` sharing an index. The 'Unnamed: 0' CSV artefact is dropped when present
# and ignored when absent instead of raising.
is_selected = full_df['last_name, first_name'] == pitcher
results_df = (
    full_df.loc[is_selected]
    .drop(columns='Unnamed: 0', errors='ignore')
    .reset_index(drop=True)
)

# pitch_name plus the metric columns; derived from `columns` so the two stay in sync.
filtered_df = results_df.filter(items=['pitch_name', *columns])

### Create a histogram for each metric

In [60]:

for col in columns:
    # Pass the frame plus column names (rather than loose Series) so Plotly Express
    # labels axes and hover text from the columns. When y is given, px.histogram sums
    # y per x category; assuming one row per pitch_name, each bar is that pitch's value.
    fig = px.histogram(
        filtered_df,
        x='pitch_name',
        y=col,
        labels={'pitch_name': 'Pitch'},
        title=f'{col} by pitch',
    )
    fig.update_layout(yaxis_title=col)
    fig.show()

## Conclusion
### Different pitchers have different arsenals, with some pitches in those arsenals utilized dominantly, while others are peppered in for variety, depending largely on the pitcher. 4-seam fastballs are the most utilized pitch across all pitchers, while there's only one screwballer among the pack. Here we give some visualization of the pitch types that each pitcher has in their arsenal and pair it with some crucial metrics for analyzing those pitches' efficacy.