In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Load the training set. Rows 4296, 6420 and 8516 were deleted from the CSV by
# hand beforehand because they held bad data.
df = pd.read_csv(r'C:\Users\Rick Ahlf\Downloads\train.csv')

# Columns fed to the model. pitch_id is deliberately left out: it is mainly a
# surrogate index.
features = [
    'pitcher', 'batter_side', 'inning', 'half', 'outs', 'balls', 'strikes',
    'x55', 'y55', 'z55',
    'vx55', 'vy55', 'vz55',
    'ax', 'ay', 'az',
    'pitch_speed',
    'release_x', 'release_z', 'release_angle_x', 'release_angle_z',
    'extension',
    'break_x', 'break_z', 'induced_break_z',
    'spin_rate', 'spin_axis',
    'pfx_x', 'pfx_z', 'pfx_xLONG', 'pfx_zLONG',
    'approach_angle_x', 'approach_angle_z',
    'plate_x', 'plate_z',
]

# Encode handedness as a 0/1 flag: 'Left' -> 1, any other value -> 0
for side_col in ('pitcher_side', 'batter_side'):
    df[side_col] = np.where(df[side_col] == 'Left', 1, 0)

# Swap each pitcher value for an integer code; `le` retains the mapping so the
# codes can be translated back later.
le = LabelEncoder().fit(df['pitcher'])
df['pitcher'] = le.transform(df['pitcher'])

#Train and evaluate one model per pitcher handedness (pitcher_side value)
for p in df['pitcher_side'].unique():

    #Rows for this handedness only; drop=True avoids carrying the old index
    #along as an extra, unused 'index' column
    df_p = df[df['pitcher_side']==p].reset_index(drop=True)

    #Split into training and test sets
    train_x, test_x, train_y, test_y = train_test_split(df_p.loc[:, features], df_p['type'], test_size=0.2, random_state=0)

    #Standardize data set to unit scale (mean = 0 and variance = 1)
    scaler = StandardScaler()

    #Fit on training set only
    scaler.fit(train_x)

    #Apply transform to both the training set and the test set
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    #Neural network (seeded so weight initialisation and shuffling are
    #reproducible across runs)
    mlp = MLPClassifier(hidden_layer_sizes=(25,25,25), max_iter=500, random_state=0)

    #PCA down to 16 components (seeded in case the randomized solver is picked).
    #No separate pca.fit() here: pipe.fit() below fits this same object.
    pca = PCA(n_components=16, random_state=0)

    #Evaluate PCA pipeline
    pipe = Pipeline([('pca', pca), ('neural network', mlp)])
    pipe.fit(train_x, train_y)
    predictions = pipe.predict(test_x)

    #Confusion matrix and classification report.
    #The label list is the sorted union of true and predicted classes and is
    #passed explicitly to both metrics, so the printed header always matches
    #the matrix rows/columns even when a class shows up only in the
    #predictions or only in the test set.
    labels = np.union1d(test_y, predictions).tolist()
    print(labels)
    print(confusion_matrix(test_y, predictions, labels=labels))
    print(classification_report(test_y, predictions, labels=labels))

['CH', 'CU', 'FA', 'FC', 'FS', 'SI', 'SL']
[[ 232    0   10    1    2    6    4]
 [   0  295    0    0    0    0   14]
 [  12    0 1432    3    0   60    5]
 [   0    0    1   30    0    1    2]
 [   1    0    0    0   14    0    0]
 [   2    0   37    0    0  161    0]
 [   0   13    4    4    0    0  517]]
             precision    recall  f1-score   support

         CH       0.94      0.91      0.92       255
         CU       0.96      0.95      0.96       309
         FA       0.96      0.95      0.96      1512
         FC       0.79      0.88      0.83        34
         FS       0.88      0.93      0.90        15
         SI       0.71      0.81      0.75       200
         SL       0.95      0.96      0.96       538

avg / total       0.94      0.94      0.94      2863

['CH', 'CU', 'FA', 'SI', 'SL']
[[ 73   0   3   1   1]
 [  0 124   0   0   0]
 [  2   0 307  18   0]
 [  0   0  14  78   0]
 [  2   0   0   0  84]]
             precision    recall  f1-score   support

         