In [None]:
# Install the third-party packages this notebook imports below.
# NOTE(review): versions are unpinned, so a fresh install may pull releases whose
# APIs differ from the ones this notebook was written against — consider pinning.
!pip install pytorch_forecasting
!pip install pytorch_lightning
!pip install dill

In [None]:
#import the libraries
import torch
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import loggers
import tensorflow as tf
import tensorboard as tb
# Replace TensorFlow's gfile module with TensorBoard's TensorFlow stub.
# NOTE(review): presumably a workaround for a TensorFlow/TensorBoard gfile conflict
# when logging — confirm it is still required.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
# Seed every RNG in use (Python, NumPy, TensorFlow, PyTorch and PyTorch CUDA)
# with the same value so runs are repeatable.
random.seed(30)
np.random.seed(30)
tf.random.set_seed(30)
torch.manual_seed(30)

torch.cuda.manual_seed(30)
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder,GroupNormalizer
from pytorch_forecasting.data.examples import generate_ar_data
from torchmetrics import TweedieDevianceScore
from pytorch_forecasting.metrics import RMSE, MAPE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
import gc
from tqdm import tqdm
import holidays
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
import dill as pickle

In [None]:
# Mount Google Drive at /content/drive; the data and model paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the game-log table from the shared drive; the modelling frame `tran_df`
# is selected from this DataFrame.
gl= pd.read_csv("/content/drive/Shareddrives/MinneMUDAC/data/final_gl_Lan.csv")

  gl= pd.read_csv("/content/drive/Shareddrives/MinneMUDAC/data/final_gl_Lan.csv")


The Temporal Fusion Transformer model is used for pre-season prediction, so we only choose features that are known before the season starts.

In [None]:
# Columns kept for the pre-season model, grouped by theme.
selected_columns = [
    # Date & Time Info
    'Date', 'NumberofGames', 'DayofWeek', 'year', 'month', 'week',
    'is_weekend', 'opening_day', 'is_public_holiday', 'holidayName',

    # Home Team Info
    'HomeTeam', 'HomeTeamLeague', 'HomeTeamGameNumber', 'HomeTeam_City',
    'HomeTeam_State', 'HomeTeam_ws_winner', 'MVP_in_hometeam',
    'Cy_Young_in_hometeam', 'home_pitch10', 'home_bat10', 'home_field10',

    # Visiting Team Info
    'VisitingTeam', 'VisitingTeamLeague', 'VisitingTeamGameNumber', 'VisitingTeam_City',
    'VisitingTeam_State', 'VisitingTeam_ws_winner', 'MVP_in_visitingteam',
    'Cy_Young_in_visitingteam', 'visiting_pitch10', 'visiting_bat10', 'visiting_field10',

    # Game Info
    'BallParkID', 'Attendance', 'Capacity',

    # Season Stats
    'season_end_rank', 'season_end_w_l_ratio', 'season_end_runs_mean',
    'season_end_runs_allowed_mean', 'opponent_season_end_rank',
    'opponent_season_end_w_l_ratio', 'opponent_season_end_runs_mean',
    'opponent_season_end_runs_allowed_mean',

    # Other Teams & Info
    'NBA_Team', 'NFL_Team', 'NHL_Team', 'home_as_cnt', 'visiting_as_cnt',
    'previous_home_as_cnt', 'previous_visiting_as_cnt'
]

# Take an explicit copy: later cells add columns to this frame, and an
# independent copy keeps those assignments free of SettingWithCopyWarning
# ambiguity and guarantees `gl` itself is never modified.
tran_df = gl[selected_columns].copy()

The next part prepares the data for the model, which includes:

*   Creating the home game number as the time index
*   Removing unwanted years due to COVID
*   Dealing with missing data
*   Converting columns to the appropriate types



In [None]:
# Order every team's games chronologically (date first, then the game counter
# for that date) so that row order within a team equals playing order.
chronological_keys = ["HomeTeam", "Date", "NumberofGames"]
tran_df = tran_df.sort_values(chronological_keys)

# Running 1-based counter of home games per team; this integer column is the
# time index used for the time series.
tran_df = tran_df.assign(date_int=tran_df.groupby("HomeTeam").cumcount() + 1)

In [None]:
# Drop the tie-breaker game (game number 163): it adds no extra information
# and it disturbs the per-team time-series encoding.
is_regular_game = tran_df['HomeTeamGameNumber'].ne(163)
tran_df = tran_df[is_regular_game]

In [None]:
# Exclude 2020 and 2021 (COVID seasons) and 2023 (the season we submit
# predictions for) from the modelling frame.
excluded_years = [2020, 2021, 2023]

# Both frames are boolean-mask selections of `tran_df` that later cells assign
# new columns to; copying makes them independent frames so those assignments
# are unambiguous and raise no SettingWithCopyWarning.
tran_df_2022 = tran_df[~tran_df['year'].isin(excluded_years)].copy()

# Rows of the 2023 season, kept aside as the prediction target.
prediction = tran_df[tran_df['year'] == 2023].copy()

In [None]:
def fill_missing_values(df, reference_df):
    """Impute missing values in ``df`` and return the result as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. Must contain ``HomeTeam`` and ``Attendance`` and must not
        already contain a ``mean_attendance`` column. It is not modified.
    reference_df : pandas.DataFrame
        Frame the per-team mean attendance is computed from. Must contain
        ``HomeTeam`` and ``Attendance``.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with a helper ``mean_attendance`` column (so the index
        is reset by the merge), where

        * ``Attendance`` values that are NaN or 0 are replaced by the team's mean
          attendance in ``reference_df``;
        * remaining NaNs in ``int64``/``float64`` columns become ``-1``;
        * remaining NaNs in ``object`` columns become ``"na"``.
    """
    # Per-team mean attendance from the reference frame, attached to every row.
    mean_attendance = (
        reference_df.groupby("HomeTeam")['Attendance']
        .mean()
        .reset_index()
        .rename(columns={'Attendance': "mean_attendance"})
    )
    df = df.merge(mean_attendance, on="HomeTeam", how="left")

    # Replace NaN or 0 values in the 'Attendance' column with the mean attendance.
    attendance_missing = (df['Attendance'].isna()) | (df['Attendance'] == 0)
    df['Attendance'] = np.where(attendance_missing, df['mean_attendance'], df['Attendance'])

    # Fill the remaining NaNs according to column dtype. Assign the filled column
    # back instead of `df[col].fillna(..., inplace=True)`: that form operates on
    # a temporary Series, is deprecated, and silently does nothing under pandas
    # copy-on-write.
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype in ["int64", "float64"]:
            df[col] = df[col].fillna(-1)
        elif col_dtype == "object":
            df[col] = df[col].fillna("na")

    return df

# Usage: impute the modelling frame using its own per-team attendance means.
# (The frame to fill was previously passed as `df_to_fill`, a name that is never
# defined in this notebook and therefore raised NameError on a fresh kernel.)
tran_df_2022 = fill_missing_values(tran_df_2022, tran_df_2022)

In [None]:
# Target dtype for each group of columns.

# Treated as categorical features: cast to string first, then to category.
category_columns = [
    'DayofWeek', 'VisitingTeam', 'VisitingTeamLeague', 'HomeTeam', 'HomeTeamLeague',
    'BallParkID', 'year', 'month', 'week', 'is_weekend', 'opening_day',
    'is_public_holiday', 'holidayName', 'HomeTeam_City', 'HomeTeam_State',
    'VisitingTeam_City', 'VisitingTeam_State', 'NBA_Team', 'NFL_Team', 'NHL_Team',
    'HomeTeam_ws_winner', 'VisitingTeam_ws_winner', 'MVP_in_hometeam',
    'MVP_in_visitingteam', 'Cy_Young_in_hometeam', 'Cy_Young_in_visitingteam'
]

# Stored as 32-bit floats.
float32_columns = ['Attendance']

# Apply the casts with per-column dtype mappings rather than explicit loops.
tran_df_2022 = tran_df_2022.astype({name: str for name in category_columns})
tran_df_2022 = tran_df_2022.astype({name: 'category' for name in category_columns})
tran_df_2022 = tran_df_2022.astype({name: np.float32 for name in float32_columns})

The next steps convert our dataframe into a format that is more amenable to modeling, ensuring we capture both temporal and non-temporal patterns efficiently.

In [None]:
# Split the dataframe into training and test sets by calendar year of the game.
game_year = pd.to_datetime(tran_df_2022['Date']).dt.year
train = tran_df_2022[game_year < 2022]
test = tran_df_2022[game_year == 2022]

# Length of the encoder (historical data) and prediction (future data) windows.
max_prediction_length = 81
max_encoder_length = 500

# Construct a TimeSeriesDataSet for training.
training = TimeSeriesDataSet(
    train,  # The training dataframe
    time_idx="date_int",  # Index to order the time series
    target="Attendance",  # Target variable to predict
    group_ids=["HomeTeam"],  # Each home team is a separate series

    # Encoder and prediction window lengths.
    min_encoder_length=max_prediction_length // 2,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,

    # Static categorical variables
    static_categoricals=["HomeTeam", "HomeTeamLeague", "HomeTeam_City", "HomeTeam_State"],

    # Time-varying categorical variables that are known in advance
    time_varying_known_categoricals=[
        'DayofWeek', 'year', 'VisitingTeam', 'VisitingTeamLeague', 'month', 'week', 'is_weekend',
        'opening_day', 'is_public_holiday', 'HomeTeam_ws_winner', 'VisitingTeam_ws_winner',
        'MVP_in_visitingteam', 'MVP_in_hometeam', 'Cy_Young_in_visitingteam', 'Cy_Young_in_hometeam'
    ],

    # Static real (continuous) variables
    static_reals=['Capacity'],

    # Time-varying continuous variables that are known in advance.
    # Each variable belongs to exactly one group: 'HomeTeam_ws_winner' and
    # 'VisitingTeam_ws_winner' are string categories (listed above) and
    # 'Capacity' is a static real, so none of them is repeated here.
    time_varying_known_reals=[
        "home_pitch10", "home_bat10", "home_field10", "visiting_pitch10", "visiting_bat10",
        "visiting_field10",
        'season_end_rank', 'season_end_w_l_ratio', 'season_end_runs_mean', 'season_end_runs_allowed_mean',
        'opponent_season_end_rank', 'opponent_season_end_w_l_ratio', 'opponent_season_end_runs_mean',
        'opponent_season_end_runs_allowed_mean', "VisitingTeamGameNumber", "HomeTeamGameNumber"
    ],

    # Time-varying continuous variables that are unknown in advance (to be predicted)
    time_varying_unknown_reals=['Attendance'],

    # Normalize the target per HomeTeam, with a softplus transformation.
    target_normalizer=GroupNormalizer(groups=["HomeTeam"], transformation="softplus"),

    # Add lags of the target variable to use as features.
    lags={'Attendance': [1, 3, 5, 7, 14]},

    # Additional configurations.
    add_encoder_length=True,
    add_relative_time_idx=True,
    add_target_scales=True,
    # date_int has gaps (rows are removed after it is computed), so gaps must be allowed.
    allow_missing_timesteps=True,

    # Encoders that reserve a NaN/unknown class for these categorical columns.
    categorical_encoders={
        "HomeTeam_ws_winner": NaNLabelEncoder(add_nan=True),
        "VisitingTeam_ws_winner": NaNLabelEncoder(add_nan=True),
        'year': NaNLabelEncoder(add_nan=True)
    }
)

# Create a validation set from the training data by predicting the last
# max_prediction_length data points of each series.
# NOTE(review): `validation` is built from the same `train` frame as `training`,
# with no time cutoff applied to `training`, so the validation window appears to
# be visible during training as well — validation metrics are likely optimistic.
validation = TimeSeriesDataSet.from_dataset(training, train, predict=True, stop_randomization=True)

# Convert datasets into PyTorch dataloaders.
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)



### Use the built-in function to tune the hyperparameters

In [None]:
# import pickle

# from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

# # create study
# study = optimize_hyperparameters(
#     train_dataloader,
#     val_dataloader,
#     model_path="optuna_test",
#     n_trials=200,
#     max_epochs=50,
#     gradient_clip_val_range=(0.01, 1.0),
#     hidden_size_range=(8, 128),
#     hidden_continuous_size_range=(8, 128),
#     attention_head_size_range=(1, 4),
#     learning_rate_range=(0.001, 0.1),
#     dropout_range=(0.1, 0.3),
#     trainer_kwargs=dict(limit_train_batches=30),
#     reduce_on_plateau_patience=4,
#     use_learning_rate_finder=False,  # use Optuna to find ideal learning rate or use in-built learning rate finder
# )

# # save study results - also we can resume tuning at a later point in time
# with open("test_study.pkl", "wb") as fout:
#     pickle.dump(study, fout)

# # show best hyperparameters
# print(study.best_trial.params)

Trial 0 finished with value: 981.5067749023438 and parameters: {'gradient_clip_val': 0.47392885281873304, 'hidden_size': 17, 'dropout': 0.12057343228729256, 'hidden_continuous_size': 16, 'attention_head_size': 3, 'learning_rate': 0.012067986205675296}.

In [None]:
from pytorch_lightning.loggers import TensorBoardLogger

# Hyperparameters and configuration. The learning rate, gradient clipping value,
# hidden sizes, attention heads and dropout below are the values reported for
# Trial 0 of the hyperparameter search.
PATIENCE = 30
MAX_EPOCHS = 200
LEARNING_RATE = 0.012067986205675296
OPTUNA = False
# Early stopping: stop training if "train_loss" does not improve by at least
# 0.001 for PATIENCE consecutive epochs.
early_stop_callback = EarlyStopping(monitor="train_loss", min_delta=0.001, patience=PATIENCE, verbose=False, mode="min")

# Callback to log the learning rate during training.
lr_logger = LearningRateMonitor()

# TensorBoard logger writing to ./lightning_logs.
logger = TensorBoardLogger("lightning_logs")

# Create the PyTorch Lightning Trainer.
trainer = pl.Trainer(
    max_epochs=MAX_EPOCHS,
    # Follow DEVICE instead of the hard-coded, deprecated `gpus=1`, which fails
    # on a machine without a GPU.
    accelerator="gpu" if DEVICE == "cuda" else "cpu",
    devices=1,
    enable_model_summary=True,  # Display model summary
    gradient_clip_val=0.47392885281873304,  # Gradient clipping value
    limit_train_batches=10,  # Limit the number of training batches per epoch
    callbacks=[lr_logger, early_stop_callback],  # Add defined callbacks
    logger=logger,  # Use the defined TensorBoard logger
)

# Initialize the Temporal Fusion Transformer from the training dataset's settings.
tft = TemporalFusionTransformer.from_dataset(
    training,  # Training dataset
    learning_rate=LEARNING_RATE,
    lstm_layers=2,
    hidden_size=17,
    attention_head_size=3,
    dropout=0.12057343228729256,
    hidden_continuous_size=16,
    output_size=1,  # Single point estimate per time step
    loss=MAPE(),  # Use Mean Absolute Percentage Error as the loss function
    log_interval=10,
    reduce_on_plateau_patience=4
)

# Move the model to the selected device ('cuda' when available, otherwise 'cpu').
tft.to(DEVICE)

# Print the number of parameters in the model.
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")


  rank_zero_deprecation(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Number of parameters in network: 99.6k


In [None]:
# Train the model; the validation dataloader is evaluated alongside training.
trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

# Reload the model from the checkpoint recorded by the trainer's checkpoint
# callback.
# NOTE(review): no ModelCheckpoint is configured in this notebook, so this is
# whatever Lightning's default checkpoint callback saved — confirm that it is the
# checkpoint you intend to keep (it may simply be the most recent epoch).
best_model_path = trainer.checkpoint_callback.best_model_path
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

# Persist the reloaded model on the shared drive. `pickle` is dill in this
# notebook. The context manager guarantees the file is flushed and closed.
model_path = '/content/drive/Shareddrives/MinneMUDAC/Model/tft/tft_model_2022.bin'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_tft, model_file)

In [None]:
# Set the path to the saved model on the shared drive
model_path = '/content/drive/Shareddrives/MinneMUDAC/Model/tft/tft_model_2022.bin'

# Load the saved TFT model from the given path (file written by this notebook;
# never unpickle files from an untrusted source). The context manager closes
# the file handle once loading is done.
with open(model_path, 'rb') as model_file:
    best_tft = pickle.load(model_file)

# Concatenate the actual values (targets) from the validation dataloader
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])

# Predict the values using the loaded TFT model on the validation dataloader,
# and keep predictions on the same device as the targets before comparing.
predictions = best_tft.predict(val_dataloader)
predictions = predictions.to(actuals.device)

# Use torch operations throughout: NumPy ufuncs on tensors only work for CPU
# tensors that do not require grad.
errors = actuals - predictions
print( 'MAPE:',(errors.abs()/actuals).mean())
print( 'RMSE:',torch.sqrt((errors**2).mean()))

MAPE: tensor(0.0729)
RMSE: tensor(3021.1316)


In [None]:
# In "raw" mode predict() returns the full network output (from which quantiles
# and other details can be extracted); return_x=True also returns the model
# inputs, which plot_prediction needs.
raw_predictions, x = best_tft.predict(val_dataloader, mode="raw", return_x=True)

# Plot the first ten validation examples.
for example_idx in range(10):
    best_tft.plot_prediction(x, raw_predictions, idx=example_idx, add_loss_to_title=True);

In [None]:
# Compute the model's interpretation of the raw validation predictions and plot it.
# reduction="sum" asks interpret_output to aggregate the per-sample results by
# summation (NOTE(review): see the pytorch-forecasting docs for exactly what is
# summed over).
interpretation = best_tft.interpret_output(raw_predictions, reduction="sum")
best_tft.plot_interpretation(interpretation)

In [None]:
# Reload the saved model so the cells below can run without retraining.
# The context manager closes the file handle after loading.
model_path = '/content/drive/Shareddrives/MinneMUDAC/Model/tft/tft_model_2022.bin'
with open(model_path, 'rb') as model_file:
    best_tft = pickle.load(model_file)
# prepare last_data for prediction


In [None]:
def testing_MAPE(tft):
  """Return the mean absolute percentage error of ``tft`` on the ``test`` frame.

  The model is fed the last 500 rows of each home team from the module-level
  ``train`` frame as encoder history, followed by the module-level ``test``
  frame as the rows to forecast.

  NOTE(review): the flat prediction vector is assumed to have exactly 2430
  entries (presumably 30 teams x 81 home games), and entry 180 is dropped,
  presumably because ``test`` has one row fewer than the model output — confirm
  both constants against the data before reusing this function.
  """
  # Encoder history: final 500 rows per team (500 mirrors max_encoder_length).
  history = train.groupby('HomeTeam').tail(500)

  # Encoder rows first, then the season to be forecast.
  model_input = pd.concat([history, test], ignore_index=True)

  # Raw mode returns the network output; element 0 holds the point estimates.
  raw_output, _ = tft.predict(model_input, mode="raw", return_x=True)
  flat_pred = raw_output[0].numpy().reshape(2430,)
  flat_pred = np.delete(flat_pred, 180)
  observed = test['Attendance'].array

  return (np.abs(observed - flat_pred)/observed).mean()

In [None]:
# MAPE of the reloaded model on the held-out `test` frame.
testing_MAPE(best_tft)



0.17994474

### Prepare the 2023 season predictions for submission

In [None]:
# Choose the last 500 games of every team as encoder history.
encoder_data = train.groupby('HomeTeam').tail(500)

# The 2023 rows in `prediction` were split off before imputation and dtype
# casting, so give them the same preprocessing as the training rows: impute
# with the historical per-team means, then cast to the same column types.
# `prediction` itself is left untouched.
decoder_data = fill_missing_values(prediction, tran_df_2022)
decoder_data = decoder_data.astype({name: str for name in category_columns})
decoder_data = decoder_data.astype({name: 'category' for name in category_columns})
decoder_data = decoder_data.astype({name: np.float32 for name in float32_columns})

# Combine encoder and decoder data.
new_prediction_data = pd.concat([encoder_data, decoder_data], ignore_index=True)

# Raw mode returns the network output; element 0 holds the point estimates.
# 'quantiles: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]'
new_raw_predictions, new_x = best_tft.predict(new_prediction_data, mode="raw", return_x=True)

# Flatten to one dimension; detach/cpu make the conversion work wherever the
# tensor lives.
pred = new_raw_predictions[0].detach().cpu().numpy().reshape(-1)

# One prediction per 2023 game is required downstream; fail here with a clear
# message instead of relying on a hard-coded length.
if pred.shape[0] != len(prediction):
    raise ValueError(
        f"Expected {len(prediction)} predictions (one per 2023 game) but the model returned {pred.shape[0]}"
    )



In [None]:
# Attach the predicted attendance to the 2023 rows. `assign` builds a new frame
# instead of setting a column on a selection of `tran_df`, which avoids
# SettingWithCopyWarning.
prediction = prediction.assign(predicted_attendance=pred)

# Read the MLB schedule CSV file into the 'schedule' DataFrame
schedule = pd.read_csv("/content/drive/Shareddrives/MinneMUDAC/data/2023_MLBSchedule.csv")

# Sort the schedule by 'home_team', 'game_date' and 'game_time'
predicted = schedule.sort_values(["home_team", "game_date", "game_time"])

# Copy the predictions across by POSITION (.array drops the index).
# NOTE(review): this assumes the sorted schedule lists exactly the same games in
# exactly the same order as `prediction` (sorted by HomeTeam, Date,
# NumberofGames) — verify that team codes and ordering match between the files.
predicted['predicted_attendance'] = prediction['predicted_attendance'].array

# Add the 'minnemudac_teamid' column to 'schedule', set to 'G07' for all rows
schedule['minnemudac_teamid'] = "G07"



In [None]:
predicted

Unnamed: 0,game_date,game_time,stadium_name,home_team,away_team,predicted_attendance
106,20230407,18:38:00,Angel Stadium,ANA,TOR,35127.917969
120,20230408,18:07:00,Angel Stadium,ANA,TOR,35733.191406
129,20230409,13:07:00,Angel Stadium,ANA,TOR,33069.238281
146,20230410,18:38:00,Angel Stadium,ANA,WAS,24217.949219
159,20230411,18:38:00,Angel Stadium,ANA,WAS,23909.673828
...,...,...,...,...,...,...
2276,20230920,13:05:00,Nationals Park,WAS,CHA,14314.535156
2292,20230921,19:05:00,Nationals Park,WAS,ATL,15778.401367
2304,20230922,19:05:00,Nationals Park,WAS,ATL,16835.603516
2311,20230923,13:05:00,Nationals Park,WAS,ATL,21721.699219


In [None]:
# Bring the predictions back into the original schedule order by joining on the
# game-identifying columns. validate="one_to_one" makes pandas raise if those
# keys are not unique on either side, rather than silently duplicating rows in
# the submission.
schedule = schedule.merge(
    predicted,
    on=["game_date", "game_time", "home_team", "away_team", "stadium_name"],
    validate="one_to_one",
)

In [None]:
schedule

Unnamed: 0,game_date,game_time,stadium_name,home_team,away_team,minnemudac_teamid,predicted_attendance
0,20230330,13:05:00,Yankee Stadium,NYA,SFN,G07,52221.074219
1,20230330,13:05:00,Nationals Park,WAS,ATL,G07,30136.009766
2,20230330,13:10:00,Petco Park,SDN,COL,G07,45646.238281
3,20230330,13:20:00,Wrigley Field,CHN,MIL,G07,40740.242188
4,20230330,14:10:00,Fenway Park,BOS,BAL,G07,37244.484375
...,...,...,...,...,...,...,...
2425,20231001,15:05:00,Oriole Park at Camden Yards,BAL,BOS,G07,33817.375000
2426,20231001,15:05:00,PNC Park,PIT,MIA,G07,14694.962891
2427,20231001,15:07:00,Rogers Centre,TOR,TBA,G07,32803.917969
2428,20231001,15:10:00,Citi Field,NYN,PHI,G07,44951.597656


In [None]:
# Write the schedule with predicted attendance to the shared drive.
# NOTE(review): the DataFrame index is written as an unnamed first column;
# confirm the submission format expects it.
schedule.to_csv("/content/drive/Shareddrives/MinneMUDAC/Final Prediction/2023_MLBSchedule.csv")