## Deep learning regression for attendance prediction

In [1]:
import datetime
import warnings
# warnings.simplefilter("ignore", UserWarning)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import OneHotEncoder
import random
from tqdm.autonotebook import tqdm
from sklearn.preprocessing import StandardScaler

## Setup dataset and collate function code

In [5]:
class BaseballDataset(Dataset):
    def __init__(
        self,
        split: str,
    ):
        assert split in ["train", "test"], "Invalid split"
        self.df = pd.read_csv(f"models-v2-lle//{split}set_w_lle.csv")
        # self.df = pd.read_csv(f"statistical-learning/processed_{split}.csv")
        number_col = ['previous_5_to_10MA']
        xscaler = StandardScaler().fit(self.df[number_col])
        self.df[number_col] = xscaler.transform(self.df[number_col])
        self.df.drop(['venue', 'start_hour', 'start_time', 'game_page_url', 'wind_speed'], axis=1, inplace=True)
        self.df = pd.get_dummies(self.df, columns=['team1_name', 'team2_name', 'season_type'])

        self.split = split

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        return {
            "y": row["attendance"],
            "x": list(row.drop("attendance")),
            # "x": list(ret.values()) + venue,
        }


def collate_fn(batch):
    y = [data["y"] for data in batch]
    x = [data["x"] for data in batch]

    return {
        "x": torch.Tensor(x),
        "y": torch.Tensor(y),
    }

In [6]:
dataset = BaseballDataset(split="train")
len(dataset.__getitem__(1000)["x"])

107

### Configure model

In [7]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(107, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 1)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.zeros_(self.fc1.bias)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.zeros_(self.fc2.bias)
        nn.init.xavier_uniform_(self.fc3.weight)
        nn.init.zeros_(self.fc3.bias)
        nn.init.xavier_uniform_(self.fc4.weight)
        nn.init.zeros_(self.fc4.bias)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        return self.fc4(x)

## Set up for training

In [85]:
class cfg:
    epoch = 20
    lr = 2e-3
    weight_decay = 0
    batch_size = 32
    random_seed = 1

In [86]:
torch.manual_seed(cfg.random_seed)
torch.use_deterministic_algorithms(True)
net = Net()
loss_fn = nn.L1Loss()
optimizer = torch.optim.AdamW(net.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
# optimizer = torch.optim.SGD(net.parameters(), lr=cfg.lr)

train_dataset = BaseballDataset(split="train")
test_dataset = BaseballDataset(split="test")

train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, collate_fn=collate_fn, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, collate_fn=collate_fn, shuffle=True)

In [87]:
for epoch in range(0, cfg.epoch):
    # tqdm_object = tqdm(train_dataloader, total=len(train_dataloader))
    net.train()
    loss_list = list()
    for data in tqdm(train_dataloader):
        x, y = data["x"], data["y"]

        optimizer.zero_grad()
        outputs = net(x)
        # print(outputs.squeeze().size(), y.size())
        loss = loss_fn(outputs.squeeze(), y)

        loss.backward()
        loss_list.append(loss.squeeze().item())
        optimizer.step()

    print(f"Epoch: {epoch + 1}, loss: {sum(loss_list)/len(loss_list)}")
    loss_list = list()
    net.eval()
    # eval_tqdm_object = tqdm(test_dataset, total=len(test_dataloader))
    with torch.no_grad():
        for data in test_dataloader:
            x, y = data["x"], data["y"]
            outputs = net(x)
            # print(outputs.squeeze(-1).size(), y.size())
            loss = loss_fn(outputs.squeeze(-1), y)
            loss_list.append(loss)
    print(f"Epoch: {epoch + 1}, eval loss: {sum(loss_list)/len(loss_list)}")
            # tqdm_object.set_postfix(train_loss=loss.item())

  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 1, loss: 9229.936533636586
Epoch: 1, eval loss: 8991.453125


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 2, loss: 5374.487646356886
Epoch: 2, eval loss: 5066.2802734375


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 3, loss: 4398.635842266033
Epoch: 3, eval loss: 5085.9169921875


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 4, loss: 4262.256152598728
Epoch: 4, eval loss: 5087.05517578125


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 5, loss: 4301.626765716792
Epoch: 5, eval loss: 6787.62158203125


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 6, loss: 4170.8296982090405
Epoch: 6, eval loss: 4913.1171875


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 7, loss: 4192.034609961447
Epoch: 7, eval loss: 5162.1220703125


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 8, loss: 4079.5691631904783
Epoch: 8, eval loss: 5241.41748046875


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 9, loss: 4140.270866692845
Epoch: 9, eval loss: 4906.07958984375


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 10, loss: 4055.6937188927873
Epoch: 10, eval loss: 4889.20166015625


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 11, loss: 4067.7595680177055
Epoch: 11, eval loss: 5023.74267578125


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 12, loss: 4075.6680069965732
Epoch: 12, eval loss: 5013.38330078125


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 13, loss: 4002.2482980274967
Epoch: 13, eval loss: 5489.39013671875


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 14, loss: 4015.1127738454634
Epoch: 14, eval loss: 4983.8603515625


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 15, loss: 3995.7579476378914
Epoch: 15, eval loss: 4892.13623046875


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 16, loss: 3900.662714945741
Epoch: 16, eval loss: 5006.1943359375


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 17, loss: 3971.9668045243147
Epoch: 17, eval loss: 5627.5869140625


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 18, loss: 3927.338638345504
Epoch: 18, eval loss: 4846.1103515625


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 19, loss: 4004.044986894174
Epoch: 19, eval loss: 5380.2744140625


  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: 20, loss: 3961.411487230744
Epoch: 20, eval loss: 4819.75537109375


In [88]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

pred, gt = list(), list()
for i in range(len(test_dataset)):
    x, y = test_dataset.__getitem__(i)["x"], test_dataset.__getitem__(i)["y"]
    output = net(torch.Tensor(x))
    pred.append(output.item())
    gt.append(y)

mse = mean_squared_error(gt, pred, squared=True)
rms = mean_squared_error(gt, pred, squared=False)
mae = mean_absolute_error(gt, pred)
mape = mean_absolute_percentage_error(gt, pred)

mse, rms, mae, mape

(40302627.78102215, 6348.435065511984, 4819.755506893852, 0.2239742898632815)

In [89]:
torch.save(net.state_dict(), "models-v2-lle/model/deep_regression.pt")