<a href="https://colab.research.google.com/github/William-Metz/Baseball_Transformer/blob/main/Baseball_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup



## Imports

In [None]:
import torch
from torch import nn
from torch.nn import Transformer
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torch.optim as optim
import tqdm
from tqdm import tqdm
import numpy as np
import math
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


## Mount Drive

In [None]:
# Colab-only step: mount Google Drive so the CSV files under
# /content/drive are readable by the cells below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Import Data

In [None]:
# Locations of the pitch-level and at-bat-level CSV exports on the mounted Drive.
pitches_csv = '/content/drive/My Drive/BaseballNets/pitches.csv'
atbats_csv = '/content/drive/My Drive/BaseballNets/atbats.csv'

# `df` holds one row per pitch; `df_AB` holds one row per at-bat and is joined
# onto `df` in the next cell via the shared `ab_id` key.
df = pd.read_csv(pitches_csv)
df_AB = pd.read_csv(atbats_csv)

## Data Manipulation

### Merges PitchData with AB data to get pitcher_id and batter_id

In [None]:
# Columns that are not used as model inputs or targets (pitch physics,
# strike-zone geometry, bookkeeping ids).  `ab_id` is only needed as the join
# key, so it is removed after the merge as well.
unused_columns = [
    'spin_rate', 'spin_dir', 'break_angle', 'break_length', 'break_y',
    'ax', 'ay', 'az', 'sz_bot', 'sz_top', 'type_confidence',
    'vx0', 'vy0', 'vz0', 'x', 'x0', 'y', 'y0', 'z0',
    'pfx_x', 'pfx_z', 'nasty', 'code', 'zone', 'px', 'pz',
    'o', 'stand', 'g_id', 'top', 'event_num', 'ab_id',
]

# Left join keeps every pitch row and attaches the at-bat columns
# (including pitcher_id and batter_id) to it.
df = df.merge(df_AB, how='left', on='ab_id')
df = df.drop(columns=unused_columns)


### Split and Encode data

In [None]:
# Column groups used throughout this cell.
speed_columns = ['start_speed', 'end_speed']
context_columns = ['b_score', 'b_count', 's_count', 'outs', 'pitch_num',
                   'on_1b', 'on_2b', 'on_3b', 'inning', 'p_score',
                   'batter_id', 'pitcher_id', 'p_throws']
label_columns = ['pitch_type', 'event', 'type']
one_hot_feature_columns = ['pitcher_id', 'batter_id', 'p_throws']

# Rows without a recorded speed cannot be standardised, so drop them first.
df.dropna(subset=speed_columns, inplace=True)

# Standardise both speed columns; `scaler` is kept so the transformation can
# be inverted later.
scaler = StandardScaler()
df[speed_columns] = scaler.fit_transform(df[speed_columns])

# Model inputs, categorical targets and continuous targets.
feature_columns = df[context_columns]
categorical_targets = df[label_columns]
continuous_targets = df[speed_columns]
print(categorical_targets)
print(feature_columns)

# One-hot encode the identifier-like features (one column per distinct value).
feature_columns = pd.get_dummies(feature_columns, columns=one_hot_feature_columns)

# One-hot encode the categorical targets.  Note this yields indicator columns
# named "<column>_<value>", not integer class codes.
categorical_targets = pd.get_dummies(categorical_targets)

print(categorical_targets.shape)
print(continuous_targets.shape)


       pitch_type      event type
0              FF     Flyout    X
1              FF     Flyout    C
2              SL     Flyout    S
3              CH     Flyout    B
4              CH     Flyout    B
...           ...        ...  ...
387962         CU  Strikeout    W
387963         KC    Pop Out    B
387964         KC    Pop Out    B
387965         FC    Pop Out    B
387966         FC    Pop Out    C

[383774 rows x 3 columns]
        b_score  b_count  s_count  outs  pitch_num  on_1b  on_2b  on_3b  \
0             0        0        0     0          1      0      0      0   
1             0        0        0     1          1      0      0      0   
2             0        0        0     1          2      0      0      0   
3             0        0        1     1          3      0      0      0   
4             0        1        1     1          4      0      0      0   
...         ...      ...      ...   ...        ...    ...    ...    ...   
387962        6        2        1     2 

### Model Creation

In [None]:
class PitchDataset(Dataset):
    """Pairs each feature row with its targets, split into two groups.

    Args:
        features: Indexable container of per-sample feature rows.
        targets: Container indexed as ``targets[idx, a:b]``; its first
            ``num_categorical`` columns are treated as the categorical
            targets and any remaining columns as the continuous targets.
        num_categorical: Column position at which the target row is split.
    """

    def __init__(self, features, targets, num_categorical):
        self.features, self.targets = features, targets
        self.num_categorical = num_categorical

    def __len__(self):
        # The dataset length is driven by the targets container.
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(features[idx], (categorical_part, continuous_part))``."""
        split_at = self.num_categorical
        categorical_part = self.targets[idx, :split_at]
        continuous_part = self.targets[idx, split_at:]
        return self.features[idx], (categorical_part, continuous_part)


In [None]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings, then applies dropout.

    The encoding table is stored as a non-trainable buffer ``pe`` of shape
    ``(max_len, 1, d_model)`` and is indexed by the *first* dimension of the
    input, so the input is expected to be sequence-first:
    ``(seq_len, batch, d_model)``.

    Args:
        d_model: Width of the embedding the encoding is added to. Both even
            and odd values are supported.
        dropout: Dropout probability applied after the addition.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd (cosine) slot than even
        # (sine) slots; trim the angles so the assignment widths match
        # instead of raising a shape-mismatch error.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Transformer encoder with one shared trunk and separate output heads for the
# three categorical targets and the two continuous targets.
class TransformerModel(nn.Module):
    """Shared Transformer encoder feeding four linear prediction heads.

    Args:
        input_size: Number of input features per time step.
        num_classes_pitch_type: Output width of the pitch-type head.
        num_classes_event: Output width of the event head.
        num_classes_type: Output width of the type head.
        d_model: Encoder embedding width.
        nhead: Number of attention heads.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each encoder feed-forward block.
        dropout: Dropout probability for the encoder and positional encoding.
    """

    def __init__(self, input_size, num_classes_pitch_type, num_classes_event, num_classes_type, d_model, nhead, num_encoder_layers, dim_feedforward, dropout):
        super().__init__()
        self.model_type = 'Transformer'

        # Transformer Encoder (shared).  batch_first is left at its default
        # (False), so the encoder consumes (seq_len, batch, d_model) tensors.
        self.transformer_encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(self.transformer_encoder_layer, num_layers=num_encoder_layers)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Input linear layer to match the input size to d_model
        self.input_linear = nn.Linear(input_size, d_model)

        # Output linear layers for categorical targets
        self.output_linear_pitch_type = nn.Linear(d_model, num_classes_pitch_type)
        self.output_linear_event = nn.Linear(d_model, num_classes_event)
        self.output_linear_type = nn.Linear(d_model, num_classes_type)

        # Output linear layer for continuous targets (2 for start_speed and end_speed)
        self.output_linear_continuous = nn.Linear(d_model, 2)

        self.init_weights()

    def init_weights(self):
        """Zero every head/input bias and draw weights from U(-0.1, 0.1)."""
        initrange = 0.1
        linear_layers = (
            self.input_linear,
            self.output_linear_pitch_type,
            self.output_linear_event,
            self.output_linear_type,
            self.output_linear_continuous,
        )
        for layer in linear_layers:
            nn.init.zeros_(layer.bias)
            nn.init.uniform_(layer.weight, -initrange, initrange)

    def forward(self, src):
        """Run the encoder and return logits/values from the four heads.

        Args:
            src: Either ``(batch, input_size)`` (each sample is treated as a
                sequence of length one) or batch-first
                ``(batch, seq_len, input_size)``.

        Returns:
            Tuple ``(pitch_type_logits, event_logits, type_logits,
            continuous)``; each entry has ``batch`` rows.

        Raises:
            ValueError: If ``src`` is neither 2-D nor 3-D.
        """
        if src.dim() == 2:
            # Without an explicit sequence axis, a (batch, d_model) tensor
            # broadcasts against the (seq, 1, d_model) positional table into
            # (batch, batch, d_model), which mixes samples and makes every
            # prediction depend on the last row of the batch.
            src = src.unsqueeze(1)
        elif src.dim() != 3:
            raise ValueError(
                f"src must be 2-D (batch, features) or 3-D (batch, seq, features), got {src.dim()}-D"
            )

        src = self.input_linear(src)  # (batch, seq_len, d_model)

        # The positional encoding and the encoder are both sequence-first.
        src = src.transpose(0, 1).contiguous()  # (seq_len, batch, d_model)
        src = self.pos_encoder(src)
        transformer_output = self.transformer_encoder(src)

        # Use the final time step of every sample for prediction.
        last_output = transformer_output[-1]  # (batch, d_model)

        # Categorical outputs
        output_pitch_type = self.output_linear_pitch_type(last_output)
        output_event = self.output_linear_event(last_output)
        output_type = self.output_linear_type(last_output)

        # Continuous outputs
        output_continuous = self.output_linear_continuous(last_output)

        return output_pitch_type, output_event, output_type, output_continuous



## Training

In [None]:
# ---------------------------------------------------------------------------
# Target layout
# ---------------------------------------------------------------------------
# `categorical_targets` is one-hot encoded by pd.get_dummies, which names each
# indicator column "<source column>_<value>" and emits the groups in source
# column order (pitch_type, event, type).  Derive every group's width from the
# column prefixes instead of hard-coding it.
target_column_names = categorical_targets.columns
num_classes_pitch_type = int(target_column_names.str.startswith('pitch_type_').sum())
num_classes_event = int(target_column_names.str.startswith('event_').sum())
num_classes_type = int(target_column_names.str.startswith('type_').sum())

# Total number of one-hot target columns; PitchDataset splits target rows here.
num_categorical = num_classes_pitch_type + num_classes_event + num_classes_type
if num_categorical != categorical_targets.shape[1]:
    raise ValueError(
        "Unexpected columns in categorical_targets: the pitch_type_/event_/type_ "
        f"groups cover {num_categorical} of {categorical_targets.shape[1]} columns."
    )

# Column offsets of the event and type groups inside a one-hot target row.
event_start = num_classes_pitch_type
type_start = event_start + num_classes_event

num_continuous = continuous_targets.shape[1]  # Number of continuous variables

# ---------------------------------------------------------------------------
# Model, losses, optimizer
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

transformer_model = TransformerModel(input_size=len(feature_columns.columns),
                                     num_classes_pitch_type=num_classes_pitch_type,
                                     num_classes_event=num_classes_event,
                                     num_classes_type=num_classes_type,
                                     d_model=512,
                                     nhead=8,
                                     num_encoder_layers=6,
                                     dim_feedforward=2048,
                                     dropout=0.1).to(device)

# Loss functions
criterion_cat = nn.CrossEntropyLoss()  # For categorical targets (expects class indices)
criterion_cont = nn.MSELoss()  # For continuous targets (currently unused, see below)

optimizer = torch.optim.Adam(transformer_model.parameters(), lr=0.001)

num_epochs = 1

# Number of DataFrame rows converted to tensors at a time.  This is NOT the
# mini-batch size (that is the DataLoader's batch_size below); it only bounds
# how much of the wide one-hot feature matrix is materialised at once.
batch_size = 300000

# Validation rows are collected per chunk and concatenated once at the end so
# that features and targets always stay aligned.
val_feature_chunks = []
val_target_chunks = []

num_rows = feature_columns.shape[0]

# Step by chunk size so the final, partially filled chunk is processed too.
for start_idx in range(0, num_rows, batch_size):
    end_idx = min(start_idx + batch_size, num_rows)

    # to_numpy(dtype=float32) also handles boolean dummy columns, which
    # `.values` would turn into an object array that torch cannot convert.
    features = torch.from_numpy(
        feature_columns.iloc[start_idx:end_idx].to_numpy(dtype=np.float32))
    # Only the categorical targets are trained on; the continuous targets are
    # not part of `targets`, so PitchDataset yields an empty continuous slice.
    targets = torch.from_numpy(
        categorical_targets.iloc[start_idx:end_idx].to_numpy(dtype=np.float32))

    # Split the chunk into training and validation sets.
    # NOTE(review): test_size=0.80 holds out 80% of each chunk for validation
    # and trains on only 20% - confirm this is intended and not a swapped 0.20.
    features_train, features_val, targets_train, targets_val = train_test_split(
        features, targets, test_size=0.80, random_state=42)

    val_feature_chunks.append(features_val)
    val_target_chunks.append(targets_val)

    train_dataset = PitchDataset(features_train, targets_train, num_categorical)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    for epoch in range(num_epochs):
        transformer_model.train()  # Set the model to training mode
        total_loss = 0.0

        train_loader_tqdm = tqdm(train_loader, desc=f'Epoch [{epoch+1}/{num_epochs}]')

        for inputs, (targets_cat, _targets_cont) in train_loader_tqdm:
            inputs = inputs.to(device)
            targets_cat = targets_cat.to(device)

            optimizer.zero_grad()

            # Forward pass
            outputs_pitch_type, outputs_event, outputs_type, _outputs_cont = transformer_model(inputs)

            # CrossEntropyLoss needs one class index per sample, so decode
            # each one-hot group with argmax.
            # NOTE(review): a row whose source category was missing is all
            # zeros in its group and decodes to class 0 - confirm there are
            # no missing pitch_type/event/type values, or drop those rows.
            targets_pitch_type = targets_cat[:, :event_start].argmax(dim=1)
            targets_event = targets_cat[:, event_start:type_start].argmax(dim=1)
            targets_type = targets_cat[:, type_start:num_categorical].argmax(dim=1)

            loss_pitch_type = criterion_cat(outputs_pitch_type, targets_pitch_type)
            loss_event = criterion_cat(outputs_event, targets_event)
            loss_type = criterion_cat(outputs_type, targets_type)

            # Combine losses (categorical heads only)
            loss = loss_pitch_type + loss_event + loss_type
            total_loss += loss.item()

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Update the progress bar with the loss information
            train_loader_tqdm.set_postfix(loss=loss.item())

        # Calculate average loss for the epoch
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f'Epoch [{epoch+1}/{num_epochs}] completed with average loss: {avg_loss:.4f}')

if not val_feature_chunks:
    raise ValueError("feature_columns is empty; there is nothing to train or validate on.")

# Aligned validation tensors consumed by the evaluation cell.
val_features = torch.cat(val_feature_chunks, dim=0)
val_targets = torch.cat(val_target_chunks, dim=0)


Using device: cuda


# Eval

In [None]:
from sklearn.metrics import accuracy_score
import torch.nn.functional as F

# Set the model to evaluation mode (disables dropout).
transformer_model.eval()
val_dataset = PitchDataset(val_features, val_targets, num_categorical)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

predicted_batches = []
true_batches = []

# Disable gradient computation since we are in inference mode.
with torch.no_grad():
    for inputs, (targets_cat, _targets_cont) in val_loader:
        inputs = inputs.to(device)
        targets_cat = targets_cat.to(device)

        # Get the model outputs (the continuous head is not evaluated here).
        outputs_pitch_type, outputs_event, outputs_type, _outputs_cont = transformer_model(inputs)
        heads = (outputs_pitch_type, outputs_event, outputs_type)

        # The predicted class is the arg-max logit; softmax is monotonic and
        # would not change which index wins.
        predicted_batches.append(
            torch.stack([logits.argmax(dim=1) for logits in heads], dim=1).cpu())

        # The targets are one-hot groups laid out as
        # [pitch_type | event | type], each as wide as the matching head.
        # Decode every group to a class index so it is comparable with the
        # predictions (comparing against raw one-hot columns is meaningless).
        widths = [logits.shape[1] for logits in heads]
        groups = torch.split(targets_cat[:, :sum(widths)], widths, dim=1)
        true_batches.append(
            torch.stack([group.argmax(dim=1) for group in groups], dim=1).cpu())

if not predicted_batches:
    raise ValueError("The validation loader produced no batches.")

# Columns: 0 = pitch type, 1 = event, 2 = type (class indices).
predictions_cat = torch.cat(predicted_batches, dim=0).numpy()
true_labels_cat = torch.cat(true_batches, dim=0).numpy()

# Calculate accuracy for categorical targets
accuracy_pitch_type = accuracy_score(true_labels_cat[:, 0], predictions_cat[:, 0])
accuracy_event = accuracy_score(true_labels_cat[:, 1], predictions_cat[:, 1])
accuracy_type = accuracy_score(true_labels_cat[:, 2], predictions_cat[:, 2])

# Output the accuracy
print(f'Accuracy for pitch type: {accuracy_pitch_type}')
print(f'Accuracy for event: {accuracy_event}')
print(f'Accuracy for type: {accuracy_type}')

# Continuous (speed) targets are not part of `val_targets`, so there is
# nothing to score for the regression head yet.  The names are kept as empty
# arrays; once the speeds are included, apply `scaler.inverse_transform` to
# predictions and targets before computing a regression metric such as MSE.
predictions_cont = np.array([])
true_labels_cont = np.array([])

Accuracy for pitch type: 0.0
Accuracy for event: 0.11751201036662347
Accuracy for type: 0.3895192103366236


In [None]:
# Persist the model twice: the parameter dictionary alone, and the whole
# pickled module object.
weights_path = 'model.pth'
full_model_path = 'model_full.pth'

model_weights = transformer_model.state_dict()
torch.save(model_weights, weights_path)
torch.save(transformer_model, full_model_path)
