In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from sklearn import tree
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import LinearSVC

In [2]:
#Read training data (manually deleted rows 4296, 6420, 8516 - bad data)
#The file location can be overridden through the TRAIN_CSV environment variable
#so the notebook is not tied to one machine; the original local path stays as
#the default, so existing runs behave exactly as before.
TRAIN_CSV = os.environ.get('TRAIN_CSV', r'C:\Users\Rick Ahlf\Downloads\train.csv')
df = pd.read_csv(TRAIN_CSV)

In [3]:
#Model input columns, in the order they are passed to the estimators.
#pitch_id is deliberately left out: it is mainly a surrogate index.
features = """
    pitcher batter_side inning half outs balls strikes
    x55 y55 z55 vx55 vy55 vz55
    ax ay az
    pitch_speed release_x release_z release_angle_x release_angle_z extension
    break_x break_z induced_break_z
    spin_rate spin_axis
    pfx_x pfx_z pfx_xLONG pfx_zLONG
    approach_angle_x approach_angle_z
    plate_x plate_z
""".split()

In [4]:
#Convert Left/Right to 1/0 ('Left' -> 1, anything else -> 0).
#The dtype guard makes this cell safe to re-run: once a column is numeric,
#comparing it to 'Left' again would silently turn every value into 0.
for side_col in ('pitcher_side', 'batter_side'):
    if not pd.api.types.is_numeric_dtype(df[side_col]):
        df[side_col] = np.where(df[side_col]=='Left', 1, 0)

#Convert pitcher from categorical to numeric
#(le is kept so the integer codes can be mapped back via le.classes_)
le = LabelEncoder()
df['pitcher'] = le.fit_transform(df['pitcher'])

In [5]:
#Compare four classifiers on the full data set, each one with and without a
#16-component PCA step. Test-set accuracy is collected per variant in `data`.
data = {'PCA': [], 'noPCA': []}

#Split into training and test sets
train_x, test_x, train_y, test_y = train_test_split(df.loc[:, features], df['type'], test_size=0.2, random_state=0)

#Standardize data set to unit scale (mean = 0 and variance = 1).
#Fit on the training set only, then apply the same transform to the test set.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

#Unfitted estimator prototypes. Every stochastic component is seeded so the
#scores are reproducible from one run to the next.

#Decision tree classifier
clf = tree.DecisionTreeClassifier(random_state=0)

#Logistic regression model
logisticRegr = LogisticRegression(solver='lbfgs')

#Linear SVM
linearSVM = LinearSVC(random_state=0)

#Neural network
mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

#PCA step (fitted inside each pipeline, so no separate fit is needed here)
pca = PCA(n_components=16, random_state=0)

models = [('Decision Tree Classifier', clf), ('Logistic Regression', logisticRegr),
    ('Linear SVM', linearSVM), ('Neural Network', mlp)]

#Loop through models
for name, estimator in models:

    #PCA variant: clones keep this fit independent of the no-PCA fit below
    pipe = Pipeline([('pca', clone(pca)), (name, clone(estimator))])
    pipe.fit(train_x, train_y)
    data['PCA'].append(pipe.score(test_x, test_y))

    #No-PCA variant on the standardized features
    plain_model = clone(estimator)
    plain_model.fit(train_x, train_y)
    data['noPCA'].append(plain_model.score(test_x, test_y))

#One row per variant, one column per model
results = pd.DataFrame.from_dict(data, columns=[name for name, _ in models], orient='index')

print(results)


       Decision Tree Classifier  Logistic Regression  Linear SVM  \
PCA                    0.858263             0.839776    0.836975   
noPCA                  0.913445             0.873950    0.877591   

       Neural Network  
PCA          0.933053  
noPCA        0.937535  


In [6]:
#Repeat the PCA / no-PCA comparison separately for each pitcher handedness.
#Test-set accuracy is collected per handedness and variant in `data`.
data = {'LHP': {'PCA': [], 'noPCA': []}, 'RHP': {'PCA': [], 'noPCA': []}}

#Unfitted estimator prototypes, built once; every fit below works on a clone.
#Every stochastic component is seeded so the scores are reproducible.

#Decision tree classifier
clf = tree.DecisionTreeClassifier(random_state=0)

#Logistic regression model
logisticRegr = LogisticRegression(solver='lbfgs')

#Linear SVM
linearSVM = LinearSVC(random_state=0)

#Neural network
mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

#PCA step (fitted inside each pipeline, so no separate fit is needed here)
pca = PCA(n_components=16, random_state=0)

models = [('Decision Tree Classifier', clf), ('Logistic Regression', logisticRegr),
      ('Linear SVM', linearSVM), ('Neural Network', mlp)]

for p in df['pitcher_side'].unique():

    #Results bucket for this subset: a pitcher_side value of 1 is stored under 'LHP'
    hand = 'LHP' if p==1 else 'RHP'

    #drop=True: the old index is not needed as a column
    df_p = df[df['pitcher_side']==p].reset_index(drop=True)

    #Split into training and test sets
    train_x, test_x, train_y, test_y = train_test_split(df_p.loc[:, features], df_p['type'], test_size=0.2, random_state=0)

    #Standardize data set to unit scale (mean = 0 and variance = 1).
    #Fit on the training set only, then apply the same transform to the test set.
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)

    #Loop through models
    for name, estimator in models:

        #PCA variant: clones keep this fit independent of the no-PCA fit below
        pipe = Pipeline([('pca', clone(pca)), (name, clone(estimator))])
        pipe.fit(train_x, train_y)
        data[hand]['PCA'].append(pipe.score(test_x, test_y))

        #No-PCA variant on the standardized features
        plain_model = clone(estimator)
        plain_model.fit(train_x, train_y)
        data[hand]['noPCA'].append(plain_model.score(test_x, test_y))

#Flatten to one row per (handedness, variant) pair
reform = {(hand, variant): scores for hand, by_variant in data.items() for variant, scores in by_variant.items()}

results = pd.DataFrame.from_dict(reform, columns=[name for name, _ in models], orient='index')

print(results)


              Decision Tree Classifier  Logistic Regression  Linear SVM  \
(LHP, PCA)                    0.878359             0.915134    0.915134   
(LHP, noPCA)                  0.909477             0.930693    0.933522   
(RHP, PCA)                    0.860286             0.896612    0.893119   
(RHP, noPCA)                  0.910933             0.907440    0.907789   

              Neural Network  
(LHP, PCA)          0.937765  
(LHP, noPCA)        0.927864  
(RHP, PCA)          0.939923  
(RHP, noPCA)        0.935033  


In [7]:
#Train one set of classifiers per individual pitcher and collect the test-set
#accuracy of each model, keyed by the encoded pitcher id.
data = {p: [] for p in df['pitcher'].unique()}

#individual pitchers removed from features to train individual pitcher models
#NOTE: this rebinds the notebook-level `features`; re-running an earlier cell
#that reads `features` after this one will use this list without 'pitcher'.
features = ['batter_side', 'inning', 'half',
   'outs', 'balls', 'strikes', 'x55', 'y55', 'z55', 'vx55', 'vy55', 'vz55',
   'ax', 'ay', 'az', 'pitch_speed', 'release_x', 'release_z',
   'release_angle_x', 'release_angle_z', 'extension', 'break_x', 'break_z',
   'induced_break_z', 'spin_rate', 'spin_axis', 'pfx_x', 'pfx_z',
   'pfx_xLONG', 'pfx_zLONG', 'approach_angle_x', 'approach_angle_z',
   'plate_x', 'plate_z']

#Column labels for the results table, defined here so this cell does not rely
#on a `models` variable left behind by an earlier cell.
model_names = ['Decision Tree Classifier', 'Logistic Regression',
               'Linear SVM', 'Neural Network']

for p in df['pitcher'].unique():

    #drop=True: the old index is not needed as a column
    df_p = df[df['pitcher']==p].reset_index(drop=True)

    #Split into training and test sets
    train_x, test_x, train_y, test_y = train_test_split(df_p.loc[:, features], df_p['type'], test_size=0.2, random_state=0)

    #Ensure multi-class
    if len(train_y.unique())<2:
        print('Pitcher', p, 'only has a single pitch type available!')
        #No classifier is trained on a single class. Record the accuracy of
        #always predicting that one pitch type on the held-out pitches, which
        #is 1.0 only when the test split holds no other pitch type.
        only_type = train_y.iloc[0]
        baseline = float((test_y == only_type).mean())
        data[p] = [baseline] * len(model_names)
        continue

    #Standardize data set to unit scale (mean = 0 and variance = 1).
    #Fit on the training set only, then apply the same transform to the test set.
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    test_x = scaler.transform(test_x)

    #Fresh estimators for every pitcher, listed in the same order as model_names

    #Decision tree classifier
    clf = tree.DecisionTreeClassifier(random_state=0)

    #Logistic regression model
    logisticRegr = LogisticRegression(solver='lbfgs')

    #Linear SVM
    linearSVM = LinearSVC(random_state=0)

    #Neural network (seeded so the scores are reproducible)
    mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

    #Loop through models
    for current_model in (clf, logisticRegr, linearSVM, mlp):

        #Fit on this pitcher's training pitches and store the test accuracy
        current_model.fit(train_x, train_y)
        data[p].append(current_model.score(test_x, test_y))

#One row per pitcher, one column per model
results = pd.DataFrame.from_dict(data, columns=model_names, orient='index').sort_index().rename_axis('Pitcher')

print(results)

print('\nAverage Accuracy')
print(results.mean())

Pitcher 19 only has a single pitch type available!
         Decision Tree Classifier  Logistic Regression  Linear SVM  \
Pitcher                                                              
0                        0.900000             0.933333    0.933333   
1                        0.727273             0.909091    0.909091   
2                        0.666667             1.000000    1.000000   
3                        1.000000             1.000000    1.000000   
4                        0.857143             1.000000    1.000000   
5                        0.964912             0.982456    0.982456   
6                        0.818182             0.909091    0.818182   
7                        1.000000             1.000000    1.000000   
8                        0.900000             1.000000    1.000000   
9                        0.806306             0.837838    0.842342   
10                       0.980000             0.980000    1.000000   
11                       0.955882      