In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import LinearSVC

In [2]:
#Read training data (manually deleted rows 4296, 6420, 8516 - bad data)
#NOTE: that row deletion was done by hand on the CSV itself, outside this
#notebook, so it has to be repeated on any fresh copy of the file.
#The location can be overridden with the PITCH_TRAIN_CSV environment variable so
#the notebook also runs on other machines; the default is the original path.
TRAIN_CSV_PATH = os.environ.get('PITCH_TRAIN_CSV', r'C:\Users\Rick Ahlf\Downloads\train.csv')
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
#Columns fed to the models as inputs, one per line.
#pitch_id is deliberately left out: it is mainly a surrogate index.
features = [
    'pitcher',
    'batter_side',
    'inning',
    'half',
    'outs',
    'balls',
    'strikes',
    'x55',
    'y55',
    'z55',
    'vx55',
    'vy55',
    'vz55',
    'ax',
    'ay',
    'az',
    'pitch_speed',
    'release_x',
    'release_z',
    'release_angle_x',
    'release_angle_z',
    'extension',
    'break_x',
    'break_z',
    'induced_break_z',
    'spin_rate',
    'spin_axis',
    'pfx_x',
    'pfx_z',
    'pfx_xLONG',
    'pfx_zLONG',
    'approach_angle_x',
    'approach_angle_z',
    'plate_x',
    'plate_z',
]

In [4]:
#Convert Left/Right to 1/0 ('Left' -> 1, every other value -> 0).
#The conversion is guarded by a dtype check so that re-running this cell is a
#no-op: once a column already holds the numeric flags, comparing it to 'Left'
#again would silently overwrite every row with 0.
for side_col in ('pitcher_side', 'batter_side'):
    if not pd.api.types.is_numeric_dtype(df[side_col]):
        df[side_col] = np.where(df[side_col]=='Left', 1, 0)

#Convert pitcher from categorical to numeric
#NOTE: `le` is refit every time this cell runs; after a re-run le.classes_ holds
#the already-encoded integers rather than the original pitcher labels.
le = LabelEncoder()
df['pitcher'] = le.fit_transform(df['pitcher'])

In [5]:
#Compare four classifiers on the whole data set, each one with and without a
#PCA step in front, and print the test-set accuracy of every combination.

#Split into training and test sets (80/20, fixed seed so the split is repeatable)
train_x, test_x, train_y, test_y = train_test_split(df.loc[:, features], df['type'], test_size=0.2, random_state=0)

#Standardize data set to unit scale (mean = 0 and variance = 1)
scaler = StandardScaler()

#Fit on training set only
scaler.fit(train_x)

#Apply transform to both the training set and the test set
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

#Decision tree classifier
clf = tree.DecisionTreeClassifier(random_state=0)

#Logistic regression model
logisticRegr = LogisticRegression(solver='lbfgs')

#Linear SVM
linearSVM = LinearSVC(random_state=0)

#Neural network
#Seeded like the other estimators: without random_state the weight
#initialisation differs on every run and the printed accuracy is not repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

#PCA down to 16 components
#Seeded because, depending on the scikit-learn version, svd_solver='auto' can
#select the randomized solver. It is not fitted here: Pipeline.fit below fits
#this same object on train_x, so a separate pre-fit would be redundant.
pca = PCA(n_components=16, random_state=0)

print('All')

#Loop through models
for model in [('Decision Tree Classifier', clf), ('Logistic Regression', logisticRegr),
              ('Linear SVM', linearSVM), ('Neural Network', mlp)]:

    model_name, current_model = model

    #With PCA: the pipeline fits pca and then the estimator on the reduced data
    pipe = Pipeline([('pca', pca), model])
    pipe.fit(train_x, train_y)
    predictions = pipe.predict(test_x)

    #Evaluate results
    print(model_name, 'w/ PCA -->', pipe.score(test_x, test_y))

    #Without PCA: refit the very same estimator object on all scaled features.
    #This has to happen after the PCA score above has been printed, because the
    #pipeline shares this estimator and the refit overwrites its fitted state.
    current_model.fit(train_x, train_y)
    predictions = current_model.predict(test_x)

    #Evaluate results
    print(model_name, 'w/o PCA -->', current_model.score(test_x, test_y))


All
Decision Tree Classifier w/ PCA --> 0.858263305322
Decision Tree Classifier w/o PCA --> 0.913445378151
Logistic Regression w/ PCA --> 0.839775910364
Logistic Regression w/o PCA --> 0.873949579832
Linear SVM w/ PCA --> 0.836974789916
Linear SVM w/o PCA --> 0.877591036415
Neural Network w/ PCA --> 0.932212885154
Neural Network w/o PCA --> 0.934453781513


In [6]:
#Repeat the with/without-PCA comparison separately for each pitcher handedness.
#pitcher_side is expected to hold the 1/0 flags produced earlier (1 = Left).
for p in df['pitcher_side'].unique():

    df_p = df[df['pitcher_side']==p].reset_index()

    #Split into training and test sets
    train_x, test_x, train_y, test_y = train_test_split(df_p.loc[:, features], df_p['type'], test_size=0.2, random_state=0)

    #Standardize data set to unit scale (mean = 0 and variance = 1)
    scaler = StandardScaler()

    #Fit on training set only
    scaler.fit(train_x)

    #Apply transform to both the training set and the test set
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    #Decision tree classifier
    clf = tree.DecisionTreeClassifier(random_state=0)

    #Logistic regression model
    logisticRegr = LogisticRegression(solver='lbfgs')

    #Linear SVM
    linearSVM = LinearSVC(random_state=0)

    #Neural network
    #Seeded like the other estimators: without random_state the weight
    #initialisation differs on every run and the printed accuracy is not repeatable.
    mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

    #PCA down to 16 components
    #Seeded because, depending on the scikit-learn version, svd_solver='auto' can
    #select the randomized solver. It is not fitted here: Pipeline.fit below fits
    #this same object on train_x, so a separate pre-fit would be redundant.
    pca = PCA(n_components=16, random_state=0)

    print('LHP' if p==1 else 'RHP')

    #Loop through models
    for model in [('Decision Tree Classifier', clf), ('Logistic Regression', logisticRegr),
                  ('Linear SVM', linearSVM), ('Neural Network', mlp)]:

        model_name, current_model = model

        #With PCA: the pipeline fits pca and then the estimator on the reduced data
        pipe = Pipeline([('pca', pca), model])
        pipe.fit(train_x, train_y)
        predictions = pipe.predict(test_x)

        #Evaluate results
        print(model_name, 'w/ PCA -->', pipe.score(test_x, test_y))

        #Without PCA: refit the very same estimator object on all scaled features.
        #This has to happen after the PCA score above has been printed, because
        #the pipeline shares this estimator and the refit overwrites its state.
        current_model.fit(train_x, train_y)
        predictions = current_model.predict(test_x)

        #Evaluate results
        print(model_name, 'w/o PCA -->', current_model.score(test_x, test_y))
        

RHP
Decision Tree Classifier w/ PCA --> 0.860286412854
Decision Tree Classifier w/o PCA --> 0.910932588194
Logistic Regression w/ PCA --> 0.896611945512
Logistic Regression w/o PCA --> 0.907439748516
Linear SVM w/ PCA --> 0.893119105833
Linear SVM w/o PCA --> 0.907789032483
Neural Network w/ PCA --> 0.935731749913
Neural Network w/o PCA --> 0.933286762138
LHP
Decision Tree Classifier w/ PCA --> 0.878359264498
Decision Tree Classifier w/o PCA --> 0.909476661952
Logistic Regression w/ PCA --> 0.91513437058
Logistic Regression w/o PCA --> 0.930693069307
Linear SVM w/ PCA --> 0.91513437058
Linear SVM w/o PCA --> 0.933521923621
Neural Network w/ PCA --> 0.926449787836
Neural Network w/o PCA --> 0.94342291372


In [7]:
#Input columns for the models, one per line; 'pitcher' is not in this list.
#pitch_id is deliberately left out as well: it is mainly a surrogate index.
#NOTE: this rebinds the notebook-wide name `features`, so any cell executed
#after this one sees this list.
features = [
    'batter_side',
    'inning',
    'half',
    'outs',
    'balls',
    'strikes',
    'x55',
    'y55',
    'z55',
    'vx55',
    'vy55',
    'vz55',
    'ax',
    'ay',
    'az',
    'pitch_speed',
    'release_x',
    'release_z',
    'release_angle_x',
    'release_angle_z',
    'extension',
    'break_x',
    'break_z',
    'induced_break_z',
    'spin_rate',
    'spin_axis',
    'pfx_x',
    'pfx_z',
    'pfx_xLONG',
    'pfx_zLONG',
    'approach_angle_x',
    'approach_angle_z',
    'plate_x',
    'plate_z',
]

#Train and score the four classifiers separately for every pitcher value found
#in df['pitcher'], printing the test-set accuracy of each model.
for p in df['pitcher'].unique():

    df_p = df[df['pitcher']==p].reset_index()

    #A pitcher with a single row cannot be split: train_test_split would raise
    #because the training set would come out empty. Skip instead of crashing.
    if len(df_p)<2:
        print('Pitcher', p, 'has too few pitches to split into training and test sets!')
        continue

    #Split into training and test sets
    train_x, test_x, train_y, test_y = train_test_split(df_p.loc[:, features], df_p['type'], test_size=0.2, random_state=0)

    #Standardize data set to unit scale (mean = 0 and variance = 1)
    scaler = StandardScaler()

    #Fit on training set only
    scaler.fit(train_x)

    #Apply transform to both the training set and the test set
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    #Ensure multi-class: with one label in the training split there is nothing
    #to classify, so the pitcher is reported and skipped
    if len(train_y.unique())<2:
        print('Pitcher', p, 'only has a single pitch type available!')
        continue

    #Decision tree classifier
    clf = tree.DecisionTreeClassifier(random_state=0)

    #Logistic regression model
    logisticRegr = LogisticRegression(solver='lbfgs')

    #Linear SVM
    linearSVM = LinearSVC(random_state=0)

    #Neural network
    #Seeded like the other estimators: without random_state the weight
    #initialisation differs on every run and the printed accuracy is not repeatable.
    mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

    print(p)

    #Loop through models
    for model in [('Decision Tree Classifier', clf), ('Logistic Regression', logisticRegr),
                  ('Linear SVM', linearSVM), ('Neural Network', mlp)]:

        model_name, current_model = model

        #Evaluate model
        current_model.fit(train_x, train_y)
        predictions = current_model.predict(test_x)

        #Evaluate results
        print(model_name, current_model.score(test_x, test_y))

31
Decision Tree Classifier 0.993333333333
Logistic Regression 1.0
Linear SVM 0.993333333333
Neural Network 0.993333333333
34
Decision Tree Classifier 0.974276527331
Logistic Regression 0.993569131833
Linear SVM 0.993569131833
Neural Network 0.990353697749
23
Decision Tree Classifier 0.86013986014
Logistic Regression 0.874125874126
Linear SVM 0.888111888112
Neural Network 0.867132867133
20
Decision Tree Classifier 0.937823834197
Logistic Regression 0.968911917098
Linear SVM 0.968911917098
Neural Network 0.968911917098
18
Decision Tree Classifier 0.98
Logistic Regression 1.0
Linear SVM 1.0
Neural Network 1.0
12
Decision Tree Classifier 0.974683544304
Logistic Regression 0.981012658228
Linear SVM 0.993670886076
Neural Network 0.974683544304
14
Decision Tree Classifier 0.939209726444
Logistic Regression 0.960486322188
Linear SVM 0.966565349544
Neural Network 0.960486322188
0
Decision Tree Classifier 0.9
Logistic Regression 0.933333333333
Linear SVM 0.933333333333
Neural Network 0.90833333