# Testing the Player Dataloader

In [1]:
# Imports
import pandas as pd
from torch.utils.data import DataLoader, Dataset

In [5]:
# Data Paths
# Relative path to the Excel workbook that is loaded with pd.read_excel below;
# it is resolved against the notebook's current working directory.
file = './Data/player/processed/player_data.xlsx'

In [26]:
# Columns to read from the workbook; every other column is ignored.
cols = [
    'Rk', 'Player', 'Age', 'Tm', 'Pos', 'GP', 'G', 'A', 'PTS', '+/-', 'PIM',
    'PS', 'EV', 'PP', 'SH', 'GW', 'EV.1', 'PP.1', 'SH.1', 'S', 'S%', 'TOI', 'ATOI', 'Season'
]
df = pd.read_excel(file, header=0, usecols=cols)

# Remove literal asterisks from every string cell. The pattern is a raw string so
# the backslash reaches the regex engine untouched ('\*' in a normal string is an
# invalid escape sequence and triggers a warning on recent Python versions).
df = df.replace(r'\*', '', regex=True)

# Remember the original row order so it can be restored after de-duplication.
df['original_index'] = df.index

# Sort by games played (descending) so that, within each (Player, Age, Season)
# group, the row with the highest GP comes first.
df = df.sort_values(by='GP', ascending=False)

# Keep only the groups whose best row has more than 47 games played.
df = df.groupby(['Player', 'Age', 'Season']).filter(lambda x: x['GP'].max() > 47)

# Collapse each group to a single row: the first one, i.e. the highest-GP row.
df = df.drop_duplicates(subset=['Player', 'Age', 'Season'], keep='first')

# Restore the original ordering and drop the helper column.
df = df.sort_values(by='original_index')
df = df.drop(columns=['original_index'])

In [27]:
class PlayerDataset(Dataset):
    """
    Sliding-window dataset over per-player season rows.

    Every sample is a ``(features, target)`` pair: ``features`` is a list of ``N``
    DataFrames, one per consecutive season, and ``target`` is the DataFrame holding
    the same player's rows for the season right after the window.
    """

    def __init__(self, df, N=5, start_season=1990, stop_season=2023):
        """
        Players dataset

        :param df: DataFrame of preprocessed players dataset; it must contain the
            'Player' and 'Season' columns
        :param N: number of consecutive seasons to load per player. The label will be the season N+1
        :param start_season: first season a feature window may start at (inclusive)
        :param stop_season: last season a feature window may cover (inclusive). Note
            that the label of the last window is therefore taken from season
            ``stop_season + 1`` when that season exists in ``df``.
        """
        self.N = N
        self.start_season = start_season
        self.stop_season = stop_season
        self.alldata = df
        self.data = self.load_player_data(self.alldata, self.start_season, self.stop_season, self.N)

    def __len__(self):
        """Return the number of ``(features, target)`` samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        data_element = self.data[idx]
        return data_element[0], data_element[1]

    def set_seasons_per_group(self, N):
        """
        Change the window length and rebuild all samples from the stored DataFrame.

        :param N: new number of consecutive seasons per sample
        """
        self.N = N
        self.data = self.load_player_data(self.alldata, self.start_season, self.stop_season, self.N)

    def load_player_data(self, df: pd.DataFrame, start_season: int, stop_season: int, N: int):
        """
        Build the list of samples for the given season range and window length.

        For each player and each window start in
        ``start_season .. stop_season - N + 1`` a sample is produced only when the
        player has rows for all ``N`` seasons of the window and for the season that
        follows it.

        :param df: DataFrame with at least the 'Player' and 'Season' columns
        :param start_season: first window start (inclusive)
        :param stop_season: last season a window may cover (inclusive)
        :param N: number of consecutive seasons per window
        :return: list of ``[features, target]`` entries, ordered by the player's first
            appearance in ``df`` and then by window start; ``features`` is a list of
            ``N`` DataFrames and ``target`` is a DataFrame
        """
        dataset = []

        # One pass over the frame per player instead of a full boolean mask for
        # every player; sort=False keeps the players in order of first appearance.
        for _, player_rows in df.groupby('Player', sort=False):
            # Season -> rows lookup, built once and reused by every window.
            # The same DataFrame object may therefore appear in several samples.
            rows_by_season = {
                season: rows for season, rows in player_rows.groupby('Season', sort=False)
            }

            for first_season in range(start_season, stop_season - N + 2):
                window = []
                for season in range(first_season, first_season + N):
                    season_rows = rows_by_season.get(season)
                    if season_rows is None:
                        # A gap in the player's seasons invalidates this window.
                        break
                    window.append(season_rows)

                if len(window) != N:
                    continue

                # The label is the season immediately after the window.
                target = rows_by_season.get(first_season + N)
                if target is None:
                    continue

                dataset.append([window, target])

        return dataset


In [28]:
# Build the dataset from the cleaned frame using the constructor defaults
# (N=5, start_season=1990, stop_season=2023).
dataset = PlayerDataset(df)

In [20]:
# Rebuild the samples with an explicit season range; the result is only returned,
# so the samples already stored on `dataset` are left as they are.
dt = dataset.load_player_data(df, 1990, 2024, 5)

# Number of samples held by the dataset, followed by the first (features, target) pair.
print(len(dataset))
print(dataset[0])

1055
([        Rk       Player   Age   Tm Pos  GP   G   A  PTS  +/-  ...  SH  GW  \
11617  141  Zdeno Chára  36.0  BOS   D  77  17  23   40   25  ...   0   3   

       EV.1  PP.1  SH.1    S    S%     TOI                 ATOI  Season  
11617    17     5     1  168  10.1  1898.0  1900-01-01 00:39:00    2014  

[1 rows x 24 columns],         Rk       Player   Age   Tm Pos  GP  G   A  PTS  +/-  ...  SH  GW  \
12125  150  Zdeno Chára  37.0  BOS   D  63  8  12   20    0  ...   0   0   

       EV.1  PP.1  SH.1    S   S%     TOI      ATOI  Season  
12125     8     1     3  138  5.8  1471.0  23:21:00    2015  

[1 rows x 24 columns],         Rk       Player   Age   Tm Pos  GP  G   A  PTS  +/-  ...  SH  GW  \
12601  127  Zdeno Chára  38.0  BOS   D  80  9  28   37   12  ...   0   3   

       EV.1  PP.1  SH.1    S   S%     TOI                 ATOI  Season  
12601    17     9     2  158  5.7  1928.0  1900-01-01 00:06:00    2016  

[1 rows x 24 columns],         Rk       Player   Age   Tm Pos  GP

In [25]:
from torch.utils.data import random_split, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

batch_size = 100

# Hold out one tenth of the samples (rounded down) for testing.
test_length = len(dataset) // 10
train_length = len(dataset) - test_length

print(f"Test set length: {test_length}")
print(f"Train set length: {train_length}")

# Splitting the dataset into train and test sets.
# A seeded generator makes the partition identical on every run, so results
# obtained from these splits can be reproduced after a kernel restart.
split_seed = 42
dataset_test, dataset_train = random_split(
    dataset,
    [test_length, train_length],
    generator=torch.Generator().manual_seed(split_seed),
)

def generate_batch(data_batch):
    """
    Collate ``(content, title)`` samples into two zero-padded float tensors.

    Only the numeric columns of each DataFrame are converted. Text or other
    object-typed columns cannot be turned into a float tensor and are left out,
    so frames that mix strings and numbers no longer make the collate step fail.

    :param data_batch: iterable of ``(content, title)`` pairs, where ``content`` is a
        sequence of DataFrames and ``title`` is a single DataFrame
    :return: ``(content_batch, title_batch)``; within a sample the rows of all
        ``content`` frames are concatenated along dim 0, and both outputs are padded
        with 0.0 along that dimension with the batch dimension first
    """
    def _numeric_tensor(frame):
        # Keep numeric columns only, in their original column order.
        numeric = frame.select_dtypes(include='number')
        return torch.tensor(numeric.values, dtype=torch.float)

    content_batch, title_batch = [], []

    for (content, title) in data_batch:
        content_tensors = [_numeric_tensor(c) for c in content]
        title_tensor = _numeric_tensor(title)

        content_batch.append(torch.cat(content_tensors, dim=0))
        title_batch.append(title_tensor)

    content_batch = pad_sequence(content_batch, padding_value=0.0, batch_first=True)
    title_batch = pad_sequence(title_batch, padding_value=0.0, batch_first=True)

    return content_batch, title_batch


# Both loaders share the batch size, reshuffle their samples, and collate with generate_batch.
dataloader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=generate_batch,
)
dataloader_test = DataLoader(
    dataset_test,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=generate_batch,
)


Test set length: 412
Train set length: 3710
