### Download Prem Data

Using [football-data.co.uk](https://www.football-data.co.uk/englandm.php) and ~~pandas~~ polars to load the Premier League results for the 2021/22, 2022/23 and 2023/24 seasons.

In [115]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import posteriors
from datetime import datetime


def download_data(start: int = 21, end: int = 24):
    """Download Premier League results and pack them into a torch dataset.

    One ``E0.csv`` file per season is read from football-data.co.uk and the
    seasons are concatenated in order.

    Args:
        start: Two-digit starting year of the first season to load (``21``
            means the 2021/22 season). It also fixes the time origin, which
            is 1 August of the year ``2000 + start``.
        end: Two-digit starting year at which to stop (exclusive).

    Returns:
        A tuple ``(dataset, players_id_to_name_dict, players_name_to_id_dict)``.
        ``dataset`` is a ``torch.utils.data.StackDataset`` with the fields
        ``match_times`` (float64 days since the origin date),
        ``match_player_indices`` (home and away team IDs, one row per match)
        and ``match_results`` (1 = home win, 2 = away win, 0 = draw).
        The two dicts map team ID -> name and name -> ID, with IDs assigned
        in sorted name order.
    """
    # Zero-pad both halves of the season code so single-digit years produce a
    # four-character code (e.g. "0910") instead of a malformed URL.
    urls = [
        f"https://www.football-data.co.uk/mmz4281/{y:02d}{y + 1:02d}/E0.csv"
        for y in range(start, end)
    ]

    # Build the origin directly rather than parsing an f-string, which breaks
    # for start < 10 ("209-08-01" is not a valid %Y-%m-%d date).
    origin_date = datetime(2000 + start, 8, 1).date()
    data = pl.concat([pl.read_csv(url) for url in urls])

    # NOTE(review): this drops a match if *any* column is null, not only the
    # columns used below - confirm that is intended.
    data = data.drop_nulls()

    # The date format is inferred by polars; the pandas variant of this
    # function parses the same column with dayfirst=True.
    data = data.with_columns(pl.col("Date").str.to_date().alias("Timestamp"))
    data = data.with_columns(
        (pl.col("Timestamp") - origin_date).dt.total_days().alias("TimestampDays")
    )

    # Every team that appears as home or away, in sorted order, so that the
    # integer IDs are deterministic.
    players = (
        pl.concat([data["HomeTeam"], data["AwayTeam"]]).unique().sort().to_list()
    )
    players_id_to_name_dict = dict(enumerate(players))
    players_name_to_id_dict = {
        name: idx for idx, name in players_id_to_name_dict.items()
    }

    # Adds "HomeTeamID" / "AwayTeamID"; replace_strict raises on any team name
    # missing from the mapping instead of silently passing it through.
    data = data.with_columns(
        pl.col("HomeTeam", "AwayTeam")
        .replace_strict(players_name_to_id_dict)
        .name.suffix("ID")
    )

    match_times = torch.tensor(
        data["TimestampDays"].to_numpy(), dtype=torch.float64
    )
    match_player_indices = torch.tensor(
        data.select("HomeTeamID", "AwayTeamID").to_numpy()
    )

    home_goals = torch.tensor(data["FTHG"].to_numpy())
    away_goals = torch.tensor(data["FTAG"].to_numpy())

    # Result encoding: 1 = home win, 2 = away win, 0 = draw.
    match_results = torch.where(
        home_goals > away_goals, 1, torch.where(home_goals < away_goals, 2, 0)
    )

    dataset = torch.utils.data.StackDataset(
        match_times=match_times,
        match_player_indices=match_player_indices,
        match_results=match_results,
    )

    return (
        dataset,
        players_id_to_name_dict,
        players_name_to_id_dict,
    )
    
    
 
# Left as the cell's final bare expression so the notebook displays the
# returned (dataset, id -> name dict, name -> id dict) tuple.
download_data()

(<torch.utils.data.dataset.StackDataset at 0x2a568a7d1f0>,
 {0: 'Arsenal',
  1: 'Aston Villa',
  2: 'Bournemouth',
  3: 'Brentford',
  4: 'Brighton',
  5: 'Burnley',
  6: 'Chelsea',
  7: 'Crystal Palace',
  8: 'Everton',
  9: 'Fulham',
  10: 'Leeds',
  11: 'Leicester',
  12: 'Liverpool',
  13: 'Luton',
  14: 'Man City',
  15: 'Man United',
  16: 'Newcastle',
  17: 'Norwich',
  18: "Nott'm Forest",
  19: 'Sheffield United',
  20: 'Southampton',
  21: 'Tottenham',
  22: 'Watford',
  23: 'West Ham',
  24: 'Wolves'},
 {'Arsenal': 0,
  'Aston Villa': 1,
  'Bournemouth': 2,
  'Brentford': 3,
  'Brighton': 4,
  'Burnley': 5,
  'Chelsea': 6,
  'Crystal Palace': 7,
  'Everton': 8,
  'Fulham': 9,
  'Leeds': 10,
  'Leicester': 11,
  'Liverpool': 12,
  'Luton': 13,
  'Man City': 14,
  'Man United': 15,
  'Newcastle': 16,
  'Norwich': 17,
  "Nott'm Forest": 18,
  'Sheffield United': 19,
  'Southampton': 20,
  'Tottenham': 21,
  'Watford': 22,
  'West Ham': 23,
  'Wolves': 24})

In [110]:

def download_data(start=21, end=24):
    """Pandas debugging variant: load the results and print the team-ID pairs.

    One ``E0.csv`` file per season is read from football-data.co.uk and the
    seasons are concatenated in order. The function prints the
    ``HomeTeamID`` / ``AwayTeamID`` pairs, first as a NumPy array and then as
    a DataFrame, and returns ``None``; the dataset construction and the
    return statement are commented out at the bottom.

    Args:
        start: Two-digit starting year of the first season to load (``21``
            means the 2021/22 season). It also fixes the time origin, which
            is 1 August of the year ``2000 + start``.
        end: Two-digit starting year at which to stop (exclusive).
    """
    # Zero-pad both halves of the season code so single-digit years produce a
    # four-character code (e.g. "0910") instead of a malformed URL.
    urls = [
        f"https://www.football-data.co.uk/mmz4281/{y:02d}{y + 1:02d}/E0.csv"
        for y in range(start, end)
    ]

    origin_date = pd.Timestamp(year=2000 + start, month=8, day=1)
    data = pd.concat(pd.read_csv(url) for url in urls)

    # The original called data.dropna() and discarded the result, so nothing
    # was ever dropped. Only rows missing a field that is actually read below
    # are removed; rows with gaps in unrelated columns are kept.
    data = data.dropna(subset=["Date", "HomeTeam", "AwayTeam"])

    # Dates in the source files are day-first. One conversion is enough: a
    # second to_datetime pass over an already-datetime column changes nothing.
    timestamps = pd.to_datetime(data["Date"], dayfirst=True)
    data = data.assign(
        Timestamp=timestamps,
        TimestampDays=(timestamps - origin_date).dt.days.astype(int),
    )

    # Every team that appears as home or away, in sorted order, so that the
    # integer IDs are deterministic.
    players_arr = sorted(set(data["HomeTeam"]) | set(data["AwayTeam"]))
    players_name_to_id_dict = {a: i for i, a in enumerate(players_arr)}
    players_id_to_name_dict = {i: a for i, a in enumerate(players_arr)}

    data = data.assign(
        HomeTeamID=data["HomeTeam"].map(players_name_to_id_dict),
        AwayTeamID=data["AwayTeam"].map(players_name_to_id_dict),
    )

    match_times = torch.tensor(data["TimestampDays"].to_numpy(), dtype=torch.float64)
    # Debug output: the same ID pairs as a NumPy array and as a DataFrame.
    print(data[["HomeTeamID", "AwayTeamID"]].to_numpy())
    print(data[["HomeTeamID", "AwayTeamID"]])
    match_player_indices = torch.tensor(data[["HomeTeamID", "AwayTeamID"]].to_numpy())

    # NOTE(review): the remainder is disabled in this debugging cell, so the
    # function currently returns None.
    #home_goals = torch.tensor(data["FTHG"].to_numpy())
    #away_goals = torch.tensor(data["FTAG"].to_numpy())

    #match_results = torch.where(
    #    home_goals > away_goals, 1, torch.where(home_goals < away_goals, 2, 0)
    #)

    #dataset = torch.utils.data.StackDataset(
    #    match_times=match_times,
    #    match_player_indices=match_player_indices,
    #    match_results=match_results,
    #)

    #return (
    #    dataset,
    #    players_id_to_name_dict,
    #    players_name_to_id_dict,
    #)
    
    
 
# Run the pandas variant; it prints the home/away team-ID pairs (as an array,
# then as a DataFrame) and returns nothing.
download_data()

[[ 3  0]
 [15 10]
 [ 5  4]
 ...
 [13  9]
 [14 23]
 [19 21]]
     HomeTeamID  AwayTeamID
0             3           0
1            15          10
2             5           4
3             6           7
4             8          20
..          ...         ...
375           7           1
376          12          24
377          13           9
378          14          23
379          19          21

[1140 rows x 2 columns]
