# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

#Locations of the labelled (train) and unlabelled (test) pitch files
#(rows 4296, 6420, 8516 were manually deleted from the training file - bad data)
train_path = r'C:\Users\Rick Ahlf\Downloads\train.csv'
test_path = r'C:\Users\Rick Ahlf\Downloads\test.csv'

#Load both files; `test` stays untouched so the original values survive
df_train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

#Working copy of the test data that receives the categorical conversions
df_test = test.copy()

#Columns fed to the model, in order.
#pitch_id is left out on purpose: it is mainly a surrogate index.
features = [
    'pitcher',
    'batter_side',
    'inning',
    'half',
    'outs',
    'balls',
    'strikes',
    'x55',
    'y55',
    'z55',
    'vx55',
    'vy55',
    'vz55',
    'ax',
    'ay',
    'az',
    'pitch_speed',
    'release_x',
    'release_z',
    'release_angle_x',
    'release_angle_z',
    'extension',
    'break_x',
    'break_z',
    'induced_break_z',
    'spin_rate',
    'spin_axis',
    'pfx_x',
    'pfx_z',
    'pfx_xLONG',
    'pfx_zLONG',
    'approach_angle_x',
    'approach_angle_z',
    'plate_x',
    'plate_z',
]

#Encode handedness as a number in both frames: 'Left' becomes 1, anything else 0
for _frame in (df_train, df_test):
    for _col in ('pitcher_side', 'batter_side'):
        _frame[_col] = np.where(_frame[_col] == 'Left', 1, 0)

#Convert pitcher from categorical to numeric.
#The encoder is fitted ONCE on the pitchers of both files so that a given
#pitcher receives the same integer code in the training and the test frame.
#(Re-fitting on the test frame alone would renumber the pitchers and make the
#'pitcher' feature inconsistent between fitting and prediction.)
le = LabelEncoder()
le.fit(pd.concat([df_train['pitcher'], df_test['pitcher']]))
df_train['pitcher'] = le.transform(df_train['pitcher'])
df_test['pitcher'] = le.transform(df_test['pitcher'])

def apply_model(x):
    """Classify the pitches in ``x`` with a model trained on same-handed pitchers.

    A StandardScaler -> PCA(16) -> MLP pipeline is fitted on the rows of the
    module-level ``df_train`` whose ``pitcher_side`` equals that of ``x``, and
    is then used to predict the ``type`` label of every row of ``x``.

    Parameters:
        x: DataFrame of pitches to classify. It must hold the
            ``pitcher_side`` column and every column named in the
            module-level ``features`` list; the handedness is read from its
            first row.

    Returns:
        pandas Series of predicted labels carrying the same index as ``x``
        (an empty Series when ``x`` has no rows).

    Raises:
        ValueError: if ``df_train`` holds no rows for this handedness.
    """
    #Nothing to classify; also avoids indexing into an empty column below
    if x.empty:
        return pd.Series([], index=x.index, dtype=object)

    p = x['pitcher_side'].iloc[0]

    #Limit training data to LHP or RHP
    df_train_p = df_train[df_train['pitcher_side'] == p]
    if df_train_p.empty:
        raise ValueError('no training rows with pitcher_side == %r' % (p,))

    #Keep 80% of the labelled rows for fitting (fixed seed -> same split each run)
    #NOTE(review): the 20% holdout is discarded and never scored - confirm intent
    train_x, _, train_y, _ = train_test_split(
        df_train_p.loc[:, features], df_train_p['type'],
        test_size=0.2, random_state=0)

    #One pipeline does the whole job: every step is fitted on the training
    #rows only, and the same fitted steps are replayed on the rows of x.
    #(Pipeline.fit fits each transformer itself, so no separate scaler/PCA
    #fit is needed beforehand.)
    pipe = Pipeline([
        #Standardize data set to unit scale (mean = 0 and variance = 1)
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=16)),
        ('neural network', MLPClassifier(hidden_layer_sizes=(40,40,40), max_iter=500)),
    ])
    pipe.fit(train_x, train_y)
    predictions = pipe.predict(x.loc[:, features])

    return pd.Series(predictions, index=x.index)

#Classify each handedness group with its own model.
#The groups are iterated explicitly instead of going through groupby.apply:
#iteration always hands over the full group frame (including the
#'pitcher_side' column that apply_model reads), and the result is a flat
#Series no matter how many groups exist.
group_predictions = [
    apply_model(group) for _, group in df_test.groupby('pitcher_side')
]

#Index alignment puts every prediction back on the row it was made for
test['type'] = pd.concat(group_predictions)

# test.to_csv('classified_test.csv', index=False)