# Team Dataloader

This script provides the dataloader for the teams

In [122]:
# Imports
import pandas as pd
from torch import nn
from torch.utils.data import random_split, DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [123]:
# Data Paths (TODO: abstract with class loader)
file = './Data/team/processed/team_data.xlsx'

In [124]:
# GET DESIRED FIELDS
# Season 
# Name
# Points Percentage : PTS%
# Win Percentage : Win%
# Goals For Per Game : GF/G
# Goals Against Per Game : GA/G
cols = ['team name', 'Season', 'PTS%', 'GF/G', 'GA/G']
df = pd.read_excel(file, header=0, usecols=cols)

# Remove the asterix from the team names
df = df.replace('\*', '', regex=True)


In [125]:
"""
From Tutorial 9
Adapted to instead load a batch for a given N.
For example, N=5 will load batches for teams which 
have 5 consecutive seasons played
"""
def generate_batch(data_batch):
  """
  Pad samples in the batch to the longest sequence within that batch.

  """
  content_batch, title_batch = [], []

  for (content, title) in data_batch:
    content_batch.append(torch.tensor(content))
    title_batch.append(torch.tensor(title))

  content_batch = pad_sequence(content_batch, padding_value=0, batch_first=True)
  title_batch = pad_sequence(title_batch, padding_value=0, batch_first=True)

  return content_batch, title_batch

# Dataset Class

This is the custom dataset class used to represent team data

In [126]:
class TeamDataset(Dataset):
    def __init__(self, df, N=5, start_season=1990, stop_season=2023):
        """
        Teams dataset

        :param file: path to preprocessed teams dataset
        :param N: number of consecutive seasons to load per group. Note that the associated label will be the season N+1
        :return: dataset
        """

        self.file=file
        self.N=N
        self.start_season=start_season
        self.stop_season=stop_season

        # Save the default full data
        self.alldata = df

        # Get all the possible groups
        self.data = self.load_team_data(self.alldata, self.start_season, self.stop_season, self.N)


    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # self.data has [features,target] at each ID
        data_element = self.data[idx]
        return data_element[0], data_element[1]
    
    def set_seasons_per_group(self, N):
        """
        Set the number of consecutive seasons (N) to load per group.
        This function also alters self.data by converting the original data to the new N

        :param N: number of consecutive seasons to load per group. Note that the associated label will be the season N+1
        """
        self.N = N

        # Get all the possible groups
        self.data = self.load_team_data(self.alldata, self.start_season, self.stop_season, self.N)


    def load_team_data(self, df: pd.DataFrame, start_season: int, stop_season: int, N: int):
        """
        load_team_data: loads a dictionary of team data by grouping multiple groups of 
        N seasons together for a range of seasons. 
        NOTE: only teams that appear for all seasons for a given group will be added

        :param data: NHL team dataset for all years
        :param start_seasons: First season to consider (inclusive).
        :param end_seasons: Last season to consider (inclusive).
        :param N: Number of seasons per group
        :return: returns a data structure with the following format (for num_seasons=N=5)
            V2:
            NOTE: a group of seasons for a given team is considered a sample
                Group 1994: 1990 to 1994
                    Team1:
                        feature: array of stats for N seasons (1990 to 1990+N-1)
                        label: stats for season N+1
                    Team2:
                        feature: array of stats for N seasons (1990 to 1990+N-1)
                        label: stats for season N+1
                    ...

                Group 1995: 1991 to 1995
                    Team1
                        feature: array of stats for N seasons (1991 to 1991+N-1)
                        label: stats for season N+1
                    ...
            Then simply concatenate all the teams
        """ 


        # Get all the teams names (10 seconds if full dataset)
        team_names = list(set(df.loc[:, 'team name']))
        #print(team_names)
        #print(len(team_names))

        # Create dictionary with each team as key and an array of all their stats as an entry
        teams = {key: pd.DataFrame([row for idx,row in df.iterrows() if row['team name']==key]) for key in team_names}

        #print(teams)
        #print(len(teams))

        # Create dataset 
        # A data element has features (array of N seasons of a team, where each row is a season) and target (stats for season N+1)
        dataset = []

        # For each team
        for team in teams:
            df = teams[team]

            # Get groups of N seasons
            for season in range(start_season, stop_season-N+2, 1):

                # For the current group of seasons
                features = []
                for s in range(season, season+N):
                    ss = df.loc[df['Season']==s]
                    if ss.empty: # abort if inexistant season
                        break

                    features.append(ss)
                
                # Skip if not enough consecutive seasons (inexistant season)
                if not len(features) == N:
                    # TODO: season+=N for faster iterations?
                    continue

                # Add target
                target = df.loc[df['Season']==season+N]
                if target.empty: # abort if inexistant season
                    continue

                # Add data to dataset
                dataset.append([features, target])

        return dataset

In [127]:
dataset = TeamDataset(df)

#groups = dataset.load_team_data(df, 1990, 2023, 5)
dt = dataset.load_team_data2(df, 1990, 2023, 5)


In [133]:
print(dataset.__len__())
print(dataset.__getitem__(0))

637
([                 team name   PTS%  GF/G  GA/G  Season
418  Columbus Blue Jackets  0.451  2.62  3.37    2006,                  team name   PTS%  GF/G  GA/G  Season
446  Columbus Blue Jackets  0.445  2.39  2.98    2007,                  team name   PTS%  GF/G  GA/G  Season
477  Columbus Blue Jackets  0.488  2.32  2.56    2008,                  team name   PTS%  GF/G  GA/G  Season
498  Columbus Blue Jackets  0.561  2.68  2.72    2009,                  team name   PTS%  GF/G  GA/G  Season
539  Columbus Blue Jackets  0.482  2.61  3.04    2010],                  team name   PTS%  GF/G  GA/G  Season
566  Columbus Blue Jackets  0.494  2.56  3.05    2011)


In [129]:
print(dt)

[[[                 team name   PTS%  GF/G  GA/G  Season
418  Columbus Blue Jackets  0.451  2.62  3.37    2006,                  team name   PTS%  GF/G  GA/G  Season
446  Columbus Blue Jackets  0.445  2.39  2.98    2007,                  team name   PTS%  GF/G  GA/G  Season
477  Columbus Blue Jackets  0.488  2.32  2.56    2008,                  team name   PTS%  GF/G  GA/G  Season
498  Columbus Blue Jackets  0.561  2.68  2.72    2009,                  team name   PTS%  GF/G  GA/G  Season
539  Columbus Blue Jackets  0.482  2.61  3.04    2010],                  team name   PTS%  GF/G  GA/G  Season
566  Columbus Blue Jackets  0.494  2.56  3.05    2011], [[                 team name   PTS%  GF/G  GA/G  Season
446  Columbus Blue Jackets  0.445  2.39  2.98    2007,                  team name   PTS%  GF/G  GA/G  Season
477  Columbus Blue Jackets  0.488  2.32  2.56    2008,                  team name   PTS%  GF/G  GA/G  Season
498  Columbus Blue Jackets  0.561  2.68  2.72    2009,             

In [130]:
print(len(groups)) # Total number of groups (usually END_SEASON-START_SEASON-N (except when the 2005 season is present in the range))
print(len(groups[1994])) # Total number of seasons per group (N)
print(len(groups[1995][1991])) # Total number of teams per season
#print(groups[1995][1991])
print(groups)

25
5
20
{1994: {1990:               team name   PTS%  GF/G  GA/G  Season
0         Boston Bruins  0.631  3.61  2.90    1990
1        Calgary Flames  0.619  4.35  3.31    1990
2        Buffalo Sabres  0.613  3.58  3.10    1990
3    Montreal Canadiens  0.581  3.60  2.93    1990
4       Edmonton Oilers  0.563  3.94  3.54    1990
5    Chicago Blackhawks  0.550  3.95  3.68    1990
6         Winnipeg Jets  0.531  3.73  3.63    1990
7      Hartford Whalers  0.531  3.44  3.35    1990
8      New York Rangers  0.531  3.49  3.34    1990
9     New Jersey Devils  0.519  3.69  3.60    1990
10      St. Louis Blues  0.519  3.69  3.49    1990
11  Toronto Maple Leafs  0.500  4.21  4.48    1990
12  Washington Capitals  0.488  3.55  3.44    1990
14    Los Angeles Kings  0.469  4.23  4.21    1990
15   New York Islanders  0.456  3.51  3.60    1990
16  Pittsburgh Penguins  0.450  3.98  4.49    1990
17  Philadelphia Flyers  0.444  3.63  3.71    1990
18    Detroit Red Wings  0.438  3.60  4.04    1990
19    Van

# Dataloader Class

In [131]:
batch_size = 100

dataset = NewsDataset('articles_norm.csv.gz', token=True, column='content', target='title')
test_length = len(dataset)//10
train_length = len(dataset) - test_length
dataset_test, dataset_train = random_split(dataset, [test_length, train_length])

dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=generate_batch) # collate_fn: call a fnc on a batch just before its output
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=True, collate_fn=generate_batch)

NameError: name 'NewsDataset' is not defined