## Imports and Data Preprocessing

Import data and do basics of removing extraneous data

In [1]:
# Initial imports
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [2]:
# Import Verlander dataset 

verlander_df = pd.read_csv(Path("../resources/verlander_update.csv"),
                            index_col='game_date',
                            parse_dates=True,
                            infer_datetime_format=True)

display(verlander_df.head())

Unnamed: 0_level_0,pitch_type,player_name,batter,pitcher,events,description,zone,des,stand,p_throws,...,on_1b,outs_when_up,inning,at_bat_number,pitch_number,pitch_name,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-09-16,SL,"Verlander, Justin",669127,434378,strikeout,swinging_strike,9,Shea Langeliers strikes out swinging.,R,R,...,,2,5,35,5,Slider,0,2,Infield shift,Standard
2022-09-16,FF,"Verlander, Justin",669127,434378,,foul,3,Shea Langeliers strikes out swinging.,R,R,...,,2,5,35,4,4-Seam Fastball,0,2,Infield shift,Standard
2022-09-16,SL,"Verlander, Justin",669127,434378,,called_strike,13,Shea Langeliers strikes out swinging.,R,R,...,,2,5,35,3,Slider,0,2,Infield shift,Standard
2022-09-16,FF,"Verlander, Justin",669127,434378,,foul,1,Shea Langeliers strikes out swinging.,R,R,...,,2,5,35,2,4-Seam Fastball,0,2,Infield shift,Standard
2022-09-16,SL,"Verlander, Justin",669127,434378,,ball,8,Shea Langeliers strikes out swinging.,R,R,...,,2,5,35,1,Slider,0,2,Infield shift,Standard


In [3]:
# Clean dataset 

verlander_df = verlander_df.drop(columns=['des', 
'pitch_name', 
'at_bat_number', 
'inning', 
'zone', 
'type', 
'player_name', 
'batter', 
'pitcher', 
'events',
'bb_type',
'hit_location',
'description',
'home_team',
'away_team'])

display(verlander_df.head())

Unnamed: 0_level_0,pitch_type,stand,p_throws,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-09-16,SL,R,R,1,2,,,,2,5,0,2,Infield shift,Standard
2022-09-16,FF,R,R,1,2,,,,2,4,0,2,Infield shift,Standard
2022-09-16,SL,R,R,1,1,,,,2,3,0,2,Infield shift,Standard
2022-09-16,FF,R,R,1,0,,,,2,2,0,2,Infield shift,Standard
2022-09-16,SL,R,R,0,0,,,,2,1,0,2,Infield shift,Standard


In [10]:
# Encode 1st, 2nd and 3rd bases with 1s and 0s

# Fill NaN to 0 
verlander_df['on_3b'] = verlander_df['on_3b'].fillna(0)
verlander_df['on_2b'] = verlander_df['on_2b'].fillna(0)
verlander_df['on_1b'] = verlander_df['on_1b'].fillna(0)

# Change batter IDs to 1 
verlander_df['on_3b'][verlander_df['on_3b'] > 0.0] = 1.0
verlander_df['on_2b'][verlander_df['on_2b'] > 0.0] = 1.0
verlander_df['on_1b'][verlander_df['on_1b'] > 0.0] = 1.0

display(verlander_df[1000:1050])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  verlander_df['on_3b'][verlander_df['on_3b'] > 0.0] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  verlander_df['on_2b'][verlander_df['on_2b'] > 0.0] = 1.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  verlander_df['on_1b'][verlander_df['on_1b'] > 0.0] = 1.0


Unnamed: 0_level_0,pitch_type,stand,p_throws,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-06-29,FF,L,R,0,1,0.0,0.0,0.0,0,2,0,0,Standard,Standard
2022-06-29,FF,L,R,0,0,0.0,0.0,0.0,0,1,0,0,Standard,Standard
2022-06-29,CH,L,R,3,2,1.0,0.0,1.0,2,9,0,0,Standard,Standard
2022-06-29,FF,L,R,3,2,1.0,0.0,1.0,2,8,0,0,Standard,Standard
2022-06-29,SL,L,R,3,2,1.0,0.0,1.0,2,7,0,0,Standard,Standard
2022-06-29,FF,L,R,2,2,1.0,0.0,1.0,2,6,0,0,Standard,Standard
2022-06-29,FF,L,R,2,2,1.0,0.0,1.0,2,5,0,0,Standard,Standard
2022-06-29,FF,L,R,2,1,1.0,0.0,1.0,2,4,0,0,Standard,Standard
2022-06-29,FF,L,R,2,0,1.0,0.0,1.0,2,3,0,0,Standard,Standard
2022-06-29,SL,L,R,1,0,1.0,0.0,1.0,2,2,0,0,Standard,Standard


## Data Processing 

Prepare data to be fed into model

In [5]:
# Split into X and y 

X = verlander_df.drop(columns='pitch_type')
y= verlander_df['pitch_type']

In [11]:
# Use get_dummies to encode categorical variables 

X = pd.get_dummies(X)

display(X.head())


Unnamed: 0_level_0,balls,strikes,on_3b,on_2b,on_1b,outs_when_up,pitch_number,bat_score,fld_score,stand_L,stand_R,p_throws_R,if_fielding_alignment_Infield shift,if_fielding_alignment_Standard,if_fielding_alignment_Strategic,of_fielding_alignment_Standard,of_fielding_alignment_Strategic
game_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-09-16,1,2,0.0,0.0,0.0,2,5,0,2,0,1,1,1,0,0,1,0
2022-09-16,1,2,0.0,0.0,0.0,2,4,0,2,0,1,1,1,0,0,1,0
2022-09-16,1,1,0.0,0.0,0.0,2,3,0,2,0,1,1,1,0,0,1,0
2022-09-16,1,0,0.0,0.0,0.0,2,2,0,2,0,1,1,1,0,0,1,0
2022-09-16,0,0,0.0,0.0,0.0,2,1,0,2,0,1,1,1,0,0,1,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Deep Learning/Neural Network Architecture

Using the method from the module, the following basis is used to design the first iteration of the neural network: 

*the mean of the number of input features and the number of neurons in the output layer ((number of input features + number of neurons in output layer) / 2). Use a number close to this mean for the number of neurons in the first hidden layer. Repeat this pattern for subsequent hidden layers ((number of neurons in the prior hidden layer + number of neurons in output layer) / 2). Softmax is the activation for the output layer that is used for multi-class classification. Categorial cross entropy and predictive model accuracy are respectively the loss functions and metrics used for multi-class classification*

In [8]:
# Initialize the Deep Learning Neural Network model

nn_v0 = Sequential()

2022-09-22 19:36:17.993889: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Design the network architecture 

# Define the model - deep neural net
number_input_features = len(verlander_df.columns)
number_output = 1

# Define hidden layers
i = 0
hidden_nodes_layer=(number_input_features+number_output)/2
while hidden_nodes_layer > 1: 
    if i == 0:
        nn_v0.add(Dense(units=round(hidden_nodes_layer), input_dim=number_input_features, activation='relu'))
        i+=1
    else:
        hidden_nodes_layer = hidden_nodes_layer/2
        nn_v0.add(Dense(units=round(hidden_nodes_layer), activation='relu'))
        i+=1

# Define output layer
nn_v0.add(Dense(units=number_output, activation='softmax'))

# Compile the model
nn_v0.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

display(len(verlander_df.columns))
display(nn_v0.summary())

14

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 120       
                                                                 
 dense_1 (Dense)             (None, 4)                 36        
                                                                 
 dense_2 (Dense)             (None, 2)                 10        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
 dense_4 (Dense)             (None, 1)                 2         
                                                                 
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________


None

In [None]:
# Fit the data to the model

model_v0 = nn_v0.fit(X, y, )