In [1]:
# Import Modules
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image


In [2]:
# Read the pitcher data CSV into a DataFrame
pitcher_csv_path = Path("../resources/verlander_update.csv")
verlander_df = pd.read_csv(pitcher_csv_path)

# Preview the first 20 rows
display(verlander_df.head(20))

Unnamed: 0,pitch_type,game_date,player_name,batter,pitcher,events,description,zone,des,stand,...,on_1b,outs_when_up,inning,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment
0,SL,9/16/2022,"Verlander, Justin",669127,434378,strikeout,swinging_strike,9,Shea Langeliers strikes out swinging.,R,...,,2,5,35,5,Slider,0,2,Infield shift,Standard
1,FF,9/16/2022,"Verlander, Justin",669127,434378,,foul,3,Shea Langeliers strikes out swinging.,R,...,,2,5,35,4,4-Seam Fastball,0,2,Infield shift,Standard
2,SL,9/16/2022,"Verlander, Justin",669127,434378,,called_strike,13,Shea Langeliers strikes out swinging.,R,...,,2,5,35,3,Slider,0,2,Infield shift,Standard
3,FF,9/16/2022,"Verlander, Justin",669127,434378,,foul,1,Shea Langeliers strikes out swinging.,R,...,,2,5,35,2,4-Seam Fastball,0,2,Infield shift,Standard
4,SL,9/16/2022,"Verlander, Justin",669127,434378,,ball,8,Shea Langeliers strikes out swinging.,R,...,,2,5,35,1,Slider,0,2,Infield shift,Standard
5,SL,9/16/2022,"Verlander, Justin",681146,434378,field_out,hit_into_play,4,Jonah Bride pops out softly to first baseman Y...,R,...,,1,5,34,5,Slider,0,2,Standard,Standard
6,SL,9/16/2022,"Verlander, Justin",681146,434378,,called_strike,6,Jonah Bride pops out softly to first baseman Y...,R,...,,1,5,34,4,Slider,0,2,Strategic,Standard
7,FF,9/16/2022,"Verlander, Justin",681146,434378,,foul,11,Jonah Bride pops out softly to first baseman Y...,R,...,,1,5,34,3,4-Seam Fastball,0,2,Standard,Standard
8,SL,9/16/2022,"Verlander, Justin",681146,434378,,ball,14,Jonah Bride pops out softly to first baseman Y...,R,...,,1,5,34,2,Slider,0,2,Standard,Standard
9,CU,9/16/2022,"Verlander, Justin",681146,434378,,ball,13,Jonah Bride pops out softly to first baseman Y...,R,...,,1,5,34,1,Curveball,0,2,Standard,Standard


In [3]:
# Remove the columns that are not kept as model inputs
verlander_df = verlander_df.drop(
    columns=[
        'player_name',
        'batter',
        'pitcher',
        'events',
        'des',
        'home_team',
        'away_team',
        'description',
        'hit_location',
        'bb_type',
        'at_bat_number',
        'pitch_name',
    ]
)

# Preview the trimmed frame
display(verlander_df.head(20))

Unnamed: 0,pitch_type,game_date,zone,stand,p_throws,type,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,inning,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment
0,SL,9/16/2022,9,R,R,S,1,2,,,,2,5,5,0,2,Infield shift,Standard
1,FF,9/16/2022,3,R,R,S,1,2,,,,2,5,4,0,2,Infield shift,Standard
2,SL,9/16/2022,13,R,R,S,1,1,,,,2,5,3,0,2,Infield shift,Standard
3,FF,9/16/2022,1,R,R,S,1,0,,,,2,5,2,0,2,Infield shift,Standard
4,SL,9/16/2022,8,R,R,B,0,0,,,,2,5,1,0,2,Infield shift,Standard
5,SL,9/16/2022,4,R,R,X,2,2,,,,1,5,5,0,2,Standard,Standard
6,SL,9/16/2022,6,R,R,S,2,1,,,,1,5,4,0,2,Strategic,Standard
7,FF,9/16/2022,11,R,R,S,2,0,,,,1,5,3,0,2,Standard,Standard
8,SL,9/16/2022,14,R,R,B,1,0,,,,1,5,2,0,2,Standard,Standard
9,CU,9/16/2022,13,R,R,B,0,0,,,,1,5,1,0,2,Standard,Standard


In [4]:
# Pull the three on-base columns into their own frame and replace NaN with 0
on_base_df = verlander_df.loc[:, ['on_3b', 'on_2b', 'on_1b']].fillna(0)
on_base_df.head(25)

Unnamed: 0,on_3b,on_2b,on_1b
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


In [5]:
# Collapse the player ID numbers in the on-base dataframe into occupancy flags:
# every value greater than 0 becomes 1.0 and every other value is left as-is.
#
# on_base_df holds only the three on-base columns, so a single frame-wide
# DataFrame.mask covers all of them. This replaces the chained assignment
# `on_base_df.on_1b[on_base_df.on_1b > 0] = 1.0`, which writes through an
# intermediate Series: pandas warns about that pattern, and with Copy-on-Write
# enabled the write never reaches on_base_df. Building a new frame also keeps
# the cell safe to re-run.
on_base_df = on_base_df.mask(on_base_df > 0, 1.0)

on_base_df[225:255]

Unnamed: 0,on_3b,on_2b,on_1b
225,0.0,0.0,0.0
226,0.0,0.0,0.0
227,0.0,0.0,0.0
228,0.0,0.0,0.0
229,0.0,0.0,0.0
230,1.0,0.0,0.0
231,1.0,0.0,0.0
232,1.0,0.0,0.0
233,1.0,0.0,0.0
234,1.0,0.0,0.0


In [6]:
# Remove the raw on-base columns from the main frame
verlander_df = verlander_df.drop(['on_3b', 'on_2b', 'on_1b'], axis='columns')
verlander_df.head()

Unnamed: 0,pitch_type,game_date,zone,stand,p_throws,type,balls,strikes,outs_when_up,inning,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment
0,SL,9/16/2022,9,R,R,S,1,2,2,5,5,0,2,Infield shift,Standard
1,FF,9/16/2022,3,R,R,S,1,2,2,5,4,0,2,Infield shift,Standard
2,SL,9/16/2022,13,R,R,S,1,1,2,5,3,0,2,Infield shift,Standard
3,FF,9/16/2022,1,R,R,S,1,0,2,5,2,0,2,Infield shift,Standard
4,SL,9/16/2022,8,R,R,B,0,0,2,5,1,0,2,Infield shift,Standard


In [7]:
# Append the processed on-base columns to the right-hand side of verlander_df
verlander_df = pd.concat(
    [verlander_df, on_base_df],
    axis='columns',
)

# Show the same row window that was inspected for on_base_df
verlander_df[225:255]

Unnamed: 0,pitch_type,game_date,zone,stand,p_throws,type,balls,strikes,outs_when_up,inning,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,on_3b,on_2b,on_1b
225,FF,8/23/2022,13,R,R,S,1,0,1,1,2,0,0,Standard,Standard,0.0,0.0,0.0
226,CU,8/23/2022,14,R,R,B,0,0,1,1,1,0,0,Standard,Standard,0.0,0.0,0.0
227,SL,8/23/2022,3,L,R,X,0,2,0,1,3,0,0,Standard,Standard,0.0,0.0,0.0
228,CU,8/23/2022,8,L,R,S,0,1,0,1,2,0,0,Standard,Standard,0.0,0.0,0.0
229,FF,8/23/2022,8,L,R,S,0,0,0,1,1,0,0,Standard,Standard,0.0,0.0,0.0
230,SL,8/16/2022,14,R,R,X,0,2,2,7,4,3,3,Standard,Standard,1.0,0.0,0.0
231,FF,8/16/2022,11,R,R,S,0,2,2,7,3,3,3,Standard,Standard,1.0,0.0,0.0
232,FF,8/16/2022,11,R,R,S,0,1,2,7,2,3,3,Standard,Standard,1.0,0.0,0.0
233,SL,8/16/2022,12,R,R,S,0,0,2,7,1,3,3,Standard,Standard,1.0,0.0,0.0
234,FF,8/16/2022,11,R,R,S,2,2,1,7,5,3,3,Infield shift,Standard,1.0,0.0,0.0


In [8]:
# Separate the features (X) from the target (y): pitch_type is the label
X = verlander_df.drop(columns=['pitch_type'])
y = verlander_df.loc[:, 'pitch_type']

In [9]:
# Inspect each feature's dtype: columns reported as `object` are non-numeric
# and are the candidates for categorical encoding.
X.dtypes

game_date                 object
zone                       int64
stand                     object
p_throws                  object
type                      object
balls                      int64
strikes                    int64
outs_when_up               int64
inning                     int64
pitch_number               int64
bat_score                  int64
fld_score                  int64
if_fielding_alignment     object
of_fielding_alignment     object
on_3b                    float64
on_2b                    float64
on_1b                    float64
dtype: object

In [10]:
# Collect the names of the object-dtype columns of X
categorical_variables = X.columns[(X.dtypes == 'object').to_numpy()].tolist()

# game_date is object-typed too, but it is not treated as a categorical feature
categorical_variables.remove('game_date')

# Show the resulting list
categorical_variables

['stand', 'p_throws', 'type', 'if_fielding_alignment', 'of_fielding_alignment']

In [17]:
# One-hot encode the categorical variables and recombine them with the
# remaining columns of X.
from sklearn.preprocessing import OneHotEncoder

# Use the encoder's default (sparse) output and densify it explicitly below.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so passing either spelling ties the cell to one range of
# library versions; the default plus .toarray() works on all of them.
enc = OneHotEncoder()

encoded_data = enc.fit_transform(X[categorical_variables]).toarray()

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the replacement when it exists and fall back to
# the old method only on releases that predate it.
feature_names_for = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

encoded_df = pd.DataFrame(
    encoded_data,
    columns = feature_names_for(categorical_variables),
    # Reuse X's index so the column-wise concat below pairs each encoded row
    # with its source row even if X does not carry a default 0..n-1 index.
    index = X.index
)

# Everything that was not one-hot encoded (this still includes game_date)
numerical_variables = X.drop(columns = categorical_variables)

X_encoded_df = pd.concat([encoded_df, numerical_variables], axis = 1)

X_encoded_df.head()



Unnamed: 0,stand_L,stand_R,p_throws_R,type_B,type_S,type_X,if_fielding_alignment_Infield shift,if_fielding_alignment_Standard,if_fielding_alignment_Strategic,of_fielding_alignment_Standard,...,balls,strikes,outs_when_up,inning,pitch_number,bat_score,fld_score,on_3b,on_2b,on_1b
0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1,2,2,5,5,0,2,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1,2,2,5,4,0,2,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1,1,2,5,3,0,2,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1,0,2,5,2,0,2,0.0,0.0,0.0
4,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0,0,2,5,1,0,2,0.0,0.0,0.0


In [20]:
# Remove the raw game_date string, which was carried along with the
# non-categorical columns and is not used as a feature.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to the same name, a second execution would otherwise raise KeyError once
# the column is already gone.
X_encoded_df = X_encoded_df.drop(columns = 'game_date', errors = 'ignore')

X_encoded_df.columns

Index(['stand_L', 'stand_R', 'p_throws_R', 'type_B', 'type_S', 'type_X',
       'if_fielding_alignment_Infield shift', 'if_fielding_alignment_Standard',
       'if_fielding_alignment_Strategic', 'of_fielding_alignment_Standard',
       'of_fielding_alignment_Strategic', 'zone', 'balls', 'strikes',
       'outs_when_up', 'inning', 'pitch_number', 'bat_score', 'fld_score',
       'on_3b', 'on_2b', 'on_1b'],
      dtype='object')

In [21]:
# Split features and target into train and test sets; random_state pins the
# shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded_df,
    y,
    random_state=1,
)

In [22]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted scaler to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)