# File: Use an LSTM model to predict ideal pitch sequencing between any pitcher/hitter pair

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.optimizers import Adam

2025-02-23 19:52:01.710064: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Experimenting with weighting good / bad outcomes
* Temporary experiment: each pitch outcome is given a hand-picked effectiveness score so we can see how the model reacts to outcome weighting.

In [12]:
# Load the raw pitch-by-pitch data.
df = pd.read_csv("pitch_data_ids.csv")

# Score assigned to a pitch whose outcome has no entry in the mapping below.
NEUTRAL_EFFECTIVENESS = 0.5

# Define effectiveness scores (arbitrarily), from the pitcher's point of view:
# 1.0 is the best result for the pitcher, 0.0 the worst.
effectiveness_mapping = {
    "Swinging Strike": 1.0,
    "Called Strike": 0.9,
    "Foul": 0.7,
    "Hit into Out": 0.65,
    "Single": 0.2,
    "Double": 0.05,
    "Triple": 0.05,
    "Ball": 0.35,
    "Hit by Pitch": 0.1,
    "Home Run": 0.0,
    "Hit into Error": 0.5
}

# Apply effectiveness score based on outcome.
# Outcomes that are not keys of the mapping become NaN at this point.
df["effectiveness"] = df["outcome"].map(effectiveness_mapping)

# Surface unmapped outcomes instead of silently carrying NaN forward.
unmapped_outcomes = df.loc[df["effectiveness"].isna(), "outcome"]
if not unmapped_outcomes.empty:
    print(
        f"{len(unmapped_outcomes)} pitches have an outcome without an effectiveness score; "
        f"using {NEUTRAL_EFFECTIVENESS}: {unmapped_outcomes.value_counts(dropna=False).to_dict()}"
    )

# Compute rolling (expanding-mean) effectiveness score within each at-bat.
# This runs BEFORE the per-pitch fill so unknown outcomes are skipped by the
# mean rather than pulling it towards the neutral value.
# reset_index drops the at_bat_id level so the result aligns with df's own index.
df["rolling_effectiveness"] = (
    df.groupby("at_bat_id")["effectiveness"]
    .expanding()
    .mean()
    .reset_index(level=0, drop=True)
)

# Fill in any missing values with a neutral value.
# Both columns are filled: a NaN left in "effectiveness" would be written to the
# CSV and propagate into anything that consumes the column as a numeric feature.
df["effectiveness"] = df["effectiveness"].fillna(NEUTRAL_EFFECTIVENESS)
df["rolling_effectiveness"] = df["rolling_effectiveness"].fillna(NEUTRAL_EFFECTIVENESS)

# Save updated dataset
df.to_csv("pitch_data_effectiveness.csv", index=False)

print(df[["at_bat_id", "pitch_number", "outcome", "effectiveness", "rolling_effectiveness"]].head(20))

In [None]:
# Load the dataset that already carries the per-pitch effectiveness columns.
df = pd.read_csv("pitch_data_effectiveness.csv")

# Split the 'count' column into 'balls' and 'strikes'
df[['balls', 'strikes']] = df['count'].str.split('-', expand=True)
df['balls'] = df['balls'].astype(int)
df['strikes'] = df['strikes'].astype(int)
df = df.drop(columns=['count'])

# Encode categorical variables as integer codes (0..n_classes-1).
# NOTE(review): "bat_side" is encoded here but is not part of feature_columns below — confirm
# whether it was meant to be a model input.
encoders = {}
categorical_columns = ["pitch_type", "bat_side", "outcome"]
for col in categorical_columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Size the output layer from the data instead of hard-coding it.
num_pitch_types = len(encoders["pitch_type"].classes_)

# Normalize velocity
scaler = MinMaxScaler()
df["velocity"] = scaler.fit_transform(df[["velocity"]])

# Unscored outcomes can leave NaN in the effectiveness columns; a single NaN input
# turns the training loss into NaN, so fill them with a neutral value.
MISSING_EFFECTIVENESS_FILL = 0.5
effectiveness_columns = ["effectiveness", "rolling_effectiveness"]
df[effectiveness_columns] = df[effectiveness_columns].fillna(MISSING_EFFECTIVENESS_FILL)

feature_columns = ["pitch_number", "balls", "strikes", "pitch_type", "velocity", "outcome", "effectiveness", "rolling_effectiveness"]

# Fail early with a clear message rather than training on NaN inputs.
nan_feature_columns = [c for c in feature_columns if df[c].isna().any()]
if nan_feature_columns:
    raise ValueError(f"NaN values found in feature columns: {nan_feature_columns}")

grouped = df.groupby("at_bat_id")
sequences = []
labels = []

# Build one training sample per pitch that has a successor in the same at-bat:
#   input  = features of pitches 1..t   (the history so far)
#   label  = pitch type of pitch t+1    (the next pitch)
# This yields exactly one label per sequence, which is what the model below
# (final LSTM without return_sequences) predicts. At-bats with a single pitch
# contribute no samples.
for _, group in grouped:
    # Make the temporal order explicit instead of relying on CSV row order.
    group = group.sort_values("pitch_number", kind="stable")
    X = group[feature_columns].to_numpy(dtype="float32")
    pitch_types = group["pitch_type"].to_numpy()

    for t in range(1, len(group)):
        sequences.append(X[:t])        # pitches seen so far
        labels.append(pitch_types[t])  # the next pitch type

if not sequences:
    raise ValueError("No training samples: every at-bat has fewer than two pitches.")

# Check if labels are aligned properly
print(f"Number of sequences: {len(sequences)}")
print(f"Number of labels: {len(labels)}")

# Pad sequences for uniform length (zeros appended after the real pitches)
max_seq_length = max(len(seq) for seq in sequences)
X_padded = pad_sequences(sequences, maxlen=max_seq_length, padding="post", dtype="float32")

# One integer class label per sequence. The name y_padded is kept for
# compatibility with later cells, but no padding is needed: padded label slots
# would be indistinguishable from the real class 0.
y_padded = np.asarray(labels, dtype="int32")

# Check the shapes
print(f"X_padded shape: {X_padded.shape}")
print(f"y_padded shape: {y_padded.shape}")
print(np.unique(y_padded))

# Define the model
# Masking skips timesteps whose features are all 0.0, i.e. the post-padding.
model = Sequential([
    Masking(mask_value=0.0, input_shape=(max_seq_length, X_padded.shape[2])),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(num_pitch_types, activation="softmax")  # one output unit per encoded pitch type
])

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
# validation_split holds out the last 20% of samples (no shuffling before the split).
model.fit(X_padded, y_padded, epochs=30, batch_size=64, validation_split=0.2)

In [None]:
# Show Keras's summary of the `model` defined and trained in the previous cell.
model.summary()