In [7]:
#Necessary Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np

AttributeError: type object 'sklearn.utils.seq_dataset.SequentialDataset64' has no attribute '__reduce_cython__'

In [None]:
# Read the train and test CSVs (paths are relative to the working directory)
train_csv, test_csv = '2020-train.csv', '2020-test.csv'
data = pd.read_csv(train_csv)
df = pd.read_csv(test_csv)

In [None]:
# Lookup tables keyed by pitch_call / handedness.

# pitch_call -> binary label (1 = counted as a strike, 0 = not) used as the model target
sMapping = {
    'InPlay': 1,
    'BallCalled': 0,
    'StrikeCalled': 1,
    'StrikeSwinging': 1,
    'FoulBall': 1,
    'HitByPitch': 0,
    'BallIntentional': 0,
}

# pitch_call -> marker color for the scatter plot (InPlay and FoulBall share black)
cMapping = {
    'InPlay': 'black',
    'BallCalled': 'blue',
    'StrikeCalled': 'red',
    'StrikeSwinging': 'orange',
    'FoulBall': 'black',
    'HitByPitch': 'purple',
    'BallIntentional': 'green',
}

# handedness string -> numeric code for the model
sides = {
    'Right': 0,
    'Left': 1,
}

In [None]:
# Build the modelling copy of the training frame in one pass:
#   1. turn +/-inf into NaN so they are treated like any other missing value
#   2. encode pitcher/batter handedness numerically via `sides`
#      (values not present in `sides` become NaN)
#   3. replace every remaining NaN, in every column, with the 999 placeholder
data2 = (
    data.replace([np.inf, -np.inf], np.nan)
    .assign(
        pitcher_side=lambda frame: frame['pitcher_side'].map(sides),
        batter_side=lambda frame: frame['batter_side'].map(sides),
    )
    .fillna(999)
)

In [None]:
## Note that, with a larger, ongoing project, I would attempt to impute the missing values from available data
## There are some for catcher/umpire value '9c6cbb5e' where no real imputing could be done due to what would appear to be equipment errors.
## For the others, a k-nearest-neighbors imputer would be an effective strategy for filling in the missing values.
## I would attempt to find similar pitches and replace the missing values with averages from similar pitches.
## This would likely improve the accuracy of the model.

In [None]:
# Derive the binary target column by looking each pitch_call up in sMapping
# (any pitch_call value that is not a key of sMapping ends up as NaN)
data2['is_strike'] = data2['pitch_call'].map(sMapping).tolist()

In [None]:
# Scatter every pitch at its plate location, colored by its pitch call
pitch_colors = data2['pitch_call'].map(cMapping).tolist()
plt.scatter(data2['plate_side'], data2['plate_height'], c=pitch_colors)

# Proxy patches for the legend, one per pitch call
# (In Play and Foul Ball are both drawn in black)
inPlay = mpatches.Patch(label='In Play', color='black')
Ball = mpatches.Patch(label='Ball Called', color='blue')
calledStrike = mpatches.Patch(label='Strike Called', color='red')
swingStrike = mpatches.Patch(label='Strike Swinging', color='orange')
foulBall = mpatches.Patch(label='Foul Ball', color='black')
HBP = mpatches.Patch(label='Hit By Pitch', color='purple')
intBall = mpatches.Patch(label='Intentional Ball', color='green')
legend_handles = [inPlay, Ball, calledStrike, swingStrike, foulBall, HBP, intBall]

# Fix the viewing window, then attach the legend and the title
plt.xlim(-10, 10)
plt.ylim(-7.5, 15)
plt.legend(handles=legend_handles, prop={'size': 8})
plt.title("Strike Zone")

# Side note: this graph turned out to be of little analytical use; it is kept as a
# visual sanity check that the data does not look wrong.

In [None]:
# Candidate predictor columns taken from the preprocessed training frame
feature_columns = [
    'pitcher_side', 'batter_side', 'outs', 'balls', 'strikes',
    'release_speed', 'vert_release_angle', 'horz_release_angle',
    'spin_rate', 'spin_axis', 'rel_height', 'rel_side', 'extension',
    'vert_break', 'induced_vert_break', 'horz_break', 'plate_height',
    'plate_side', 'zone_speed', 'vert_approach_angle',
    'horz_approach_angle', 'zone_time', 'x55', 'z55',
]
X = data2[feature_columns]
y = data2['is_strike']

# Hold out a validation split (default split size); the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=116)

In [None]:
# Random forest classifier used to model the is_strike outcome.
# random_state is pinned (same seed as the train/test split) so that the fitted
# forest, its accuracy scores, its feature importances and its predictions are
# reproducible across kernel restarts; without it every run gives different numbers.
rf = RandomForestClassifier(random_state=116)

In [None]:
# Fit the forest on the training split (fit returns the estimator itself)
rf = rf.fit(X_train, y_train)

# Report mean accuracy on the training split (in sample) and the held-out split
# (out of sample). The percentage is rendered with a format spec rather than
# round(score, 4) * 100, which can print binary-float artifacts such as
# 56.779999999999994.
rf_train_accuracy = rf.score(X_train, y_train)
rf_test_accuracy = rf.score(X_test, y_test)
print(f'{rf_train_accuracy * 100:.2f} % in sample.')
print(f'{rf_test_accuracy * 100:.2f} % out of sample.')

In [None]:
# Pair each fitted importance with its feature name as (importance, column) tuples,
# in the same order as the columns of X
list(zip(rf.feature_importances_, X.columns))

In [None]:
# Score every row of X (training and held-out rows alike) with the fitted forest
# and store the result alongside the original data
rf_predictions = rf.predict(X)
data2['Predictions'] = rf_predictions

In [None]:
# Fit a logistic regression on the same training split to see whether it
# outperforms the random forest classifier.
# NOTE(review): the features are passed in unscaled and still contain the 999
# placeholder; MinMaxScaler is imported but not applied here - confirm whether
# scaling was intended.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out split (y_pred is reused by the classification report)
y_pred = logreg.predict(X_test)

# Mean accuracy of the logistic regression on each split, to two decimals
logreg_train_accuracy = logreg.score(X_train, y_train)
logreg_test_accuracy = logreg.score(X_test, y_test)
print(f'Accuracy of logistic regression classifier on train set: {logreg_train_accuracy:.2f}')
print(f'Accuracy of logistic regression classifier on test set: {logreg_test_accuracy:.2f}')

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the held-out split
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

In [None]:
# Apply the same preprocessing to the test data as was applied to the training data:
# +/-inf -> NaN, handedness -> numeric code via `sides`, then NaN -> 999 placeholder
df2 = (
    df.replace([np.inf, -np.inf], np.nan)
    .assign(
        pitcher_side=lambda frame: frame['pitcher_side'].map(sides),
        batter_side=lambda frame: frame['batter_side'].map(sides),
    )
    .fillna(999)
)

In [None]:
# Feature matrix for the test data.
# Select from the preprocessed frame df2 (not the raw df): the raw frame still holds
# 'Right'/'Left' strings and NaN/inf values, which the fitted model cannot consume.
# Reusing X.columns guarantees the same features, in the same order, as in training.
X2 = df2[X.columns.tolist()]

In [None]:
# Store the forest's predictions as a real column. Attribute-style assignment
# (df.is_strike = ...) only sets a Python attribute when the column does not
# already exist, so the predictions would be missing from the frame's data.
df['is_strike'] = rf.predict(X2)