In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm
import os
from datetime import datetime
from datetime import timedelta
from math import isnan

In [2]:
# Alphabetically sorted names of everything inside raw-data/; getTablesForRow
# searches this listing for the per-match table files.
dataFiles = sorted(os.listdir("raw-data"))

In [3]:
# One fixtures file per season, in chronological order.
csvs = [
    "raw-data/{}".format(fixtureFile)
    for fixtureFile in (
        '2017-2018-Premier-League-Scores-and-Fixtures',
        '2018-2019-Premier-League-Scores-and-Fixtures',
        '2019-2020-Premier-League-Scores-and-Fixtures',
        '2020-2021-Premier-League-Scores-and-Fixtures',
        '2021-2022-Premier-League-Scores-and-Fixtures',
        '2022-2023-Premier-League-Scores-and-Fixtures',
    )
]

# Stack all seasons and keep only the columns needed to identify a fixture.
# The index is not reset, so rows must be addressed by position (.iloc).
allGames = pd.concat([pd.read_csv(path) for path in csvs]).loc[:, ["Date", "Home", "Away"]]

In [4]:
# Short club name (as written in the fixtures files) -> full club name (as
# used when building the per-match file name to search for).
# Note: this is brittle and may need to change based on who gets promoted
# into the Premier League.
replaceNames = {
    "West Brom": "West Bromwich Albion",
    "Huddersfield": "Huddersfield Town",
    "Brighton": "Brighton and Hove Albion",
    "West Ham": "West Ham United",
    "Manchester Utd": "Manchester United",
    "Tottenham": "Tottenham Hotspur",
    "Newcastle Utd": "Newcastle United",
    "Wolves": "Wolverhampton Wanderers",
    "Sheffield Utd": "Sheffield United",
}


allPlayerStats = {}

def isValidRow(row):
    """Return True when the fixture at position ``row`` has a string home team.

    Rows whose "Home" cell is anything other than a plain ``str`` (for
    example a missing value) are reported as invalid.
    """
    homeTeam = allGames.iloc[row]["Home"]
    return type(homeTeam) is str


def getTablesForRow(row):
    """Load every per-match table file for the fixture at position ``row``.

    The fixture's home team, away team and date are turned into the
    fragment ``<Home>-<Away>-<Month>-<day>-<year>`` (spaces in club names
    become dashes, short club names are expanded through ``replaceNames``).
    Every entry of ``dataFiles`` containing that fragment is read from
    ``raw-data/``.

    Parameters
    ----------
    row : int
        Positional index into ``allGames``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching file, ordered by the integer that follows the
        last dash of the file name. Empty when no file matches.

    Raises
    ------
    ValueError
        If the fixture date is not in ``YYYY-MM-DD`` form, or a matching
        file name does not end in ``-<integer>``.
    """
    game = allGames.iloc[row]
    homeRaw = replaceNames.get(game["Home"], game["Home"])
    awayRaw = replaceNames.get(game["Away"], game["Away"])

    home = homeRaw.replace(" ", "-")
    away = awayRaw.replace(" ", "-")

    # The non-padded day used to come from strftime("%-d"), which is a
    # platform-specific extension and raises ValueError on Windows. Reading
    # the day and year as integers gives the same text on every platform.
    matchDay = datetime.strptime(game["Date"], '%Y-%m-%d')
    date = "{}-{}-{}".format(matchDay.strftime("%B"), matchDay.day, matchDay.year)

    substr = "{}-{}-{}".format(home, away, date)
    fileList = ["raw-data/{}".format(name) for name in dataFiles if substr in name]
    # File names end in "-<table number>"; sort numerically so that table 10
    # comes after table 9 rather than after table 1.
    fileList.sort(key=lambda path: int(path.split("-")[-1]))
    return [pd.read_csv(path) for path in fileList]


def getStat(columnName, columnKey, statName, homeTable, row, tables):
    """Read one team-total statistic for both sides of a match.

    Parameters
    ----------
    columnName : str
        Column label to read in the home and away tables.
    columnKey : str
        Value expected in the first row of that column; used to verify that
        the column really holds the intended statistic.
    statName : str
        Human-readable name used to build the result keys.
    homeTable : int
        Position of the home team's table in ``tables`` (at most 9). The
        away team's table is read from ``homeTable + 7``.
    row : int
        Fixture position; only used as the message of a failed assertion so
        the offending game can be identified.
    tables : list of pandas.DataFrame
        Tables of one match, as returned by ``getTablesForRow``.

    Returns
    -------
    dict
        ``{"Home <statName>": float, "Away <statName>": float}`` taken from
        the last row of each table.

    Raises
    ------
    AssertionError
        If ``homeTable`` is above 9 or a table's first row does not contain
        ``columnKey`` in ``columnName``.
    """
    # The away side's table sits seven positions after the home side's.
    awayTable = homeTable + 7

    assert homeTable <= 9
    assert tables[homeTable].iloc[0][columnName] == columnKey, row
    assert tables[awayTable].iloc[0][columnName] == columnKey, row

    # The last row of each table holds the value reported for the whole team.
    return {
        "Home {}".format(statName): float(tables[homeTable].iloc[-1][columnName]),
        "Away {}".format(statName): float(tables[awayTable].iloc[-1][columnName]),
    }


# (column label, expected header key, output stat name, home table position)
# for every statistic collected by getAggregateStats, in output column order.
_AGGREGATE_STAT_SPECS = (
    ("Performance", "Gls", "Goals", 3),
    ("Performance.1", "Ast", "Assists", 3),
    ("Performance.4", "Sh", "Shots Total", 3),
    ("Performance.5", "SoT", "Shots on Target", 3),
    ("SCA", "SCA", "Shot Creating Actions", 3),
    ("SCA.1", "GCA", "Goal Creating Actions", 3),
    ("Unnamed: 23_level_0", "KP", "Assisted Shots", 4),
    ("Unnamed: 24_level_0", "1/3", "Passes Into Attacking Third", 4),
    ("Unnamed: 25_level_0", "PPA", "Passes Into Attacking Penalty", 4),
    ("Unnamed: 26_level_0", "CrsPA", "Crosses Into Attacking Penalty", 4),
    ("Touches", "Touches", "Touches", 7),
    ("Touches.1", "Def Pen", "Defensive Penalty Touches", 7),
    ("Touches.2", "Def 3rd", "Defensive Third Touches", 7),
    ("Touches.3", "Mid 3rd", "Middle Third Touches", 7),
    ("Touches.4", "Att 3rd", "Attacking Third Touches", 7),
    ("Touches.5", "Att Pen", "Attacking Penalty Touches", 7),
    ("Dribbles", "Succ", "Dribbles Successful", 7),
    ("Dribbles.1", "Att", "Dribbles Attempted", 7),
    ("Performance.10", "Int", "Interceptions", 3),
    ("Performance.11", "Blocks", "Blocks", 3),
    ("Passes.3", "Prog", "Progressive Passes Completed", 3),
    ("Passes.1", "Att", "Passes Attempted", 3),
    ("Total.3", "TotDist", "Total Passing Distance", 4),
    ("Total.4", "PrgDist", "Progressive Passing Distance", 4),
    # "Crs" is the crosses column and "CK" the corner-kick column; the two
    # output names used to be attached to the wrong columns.
    ("Pass Types.5", "Crs", "Crosses", 5),
    ("Pass Types.7", "CK", "Corner Kicks", 5),
    ("Passes", "Cmp", "Pass Completed", 3),
    ("Short", "Cmp", "Short Passes Completed", 4),
    ("Short.1", "Att", "Short Passes Attempted", 4),
    ("Medium", "Cmp", "Medium Passes Completed", 4),
    ("Medium.1", "Att", "Medium Passes Attempted", 4),
    ("Long", "Cmp", "Long Passes Completed", 4),
    ("Long.1", "Att", "Long Passes Attempted", 4),
    ("Tackles", "Tkl", "Tackles", 6),
    ("Tackles.1", "TklW", "Tackles Won", 6),
    ("Performance.3", "Fls", "Fouls", 8),
    ("Performance.12", "Recov", "Loose Ball Recoveries", 8),
    ("Aerial Duels", "Won", "Aerials Won", 8),
    ("Aerial Duels.1", "Lost", "Aerials Lost", 8),
    ("Shot Stopping", "SoTA", "Shots on Target Against", 9),
    ("Shot Stopping.1", "GA", "Goals Against", 9),
)


def getAggregateStats(row):
    """Collect the team-level statistics of the fixture at position ``row``.

    Parameters
    ----------
    row : int
        Positional index into ``allGames``.

    Returns
    -------
    dict
        ``"Home"`` and ``"Away"`` (team names exactly as stored in
        ``allGames``), ``"Date"`` (a ``datetime``), followed by a
        ``"Home <stat>"`` / ``"Away <stat>"`` float pair for every entry of
        ``_AGGREGATE_STAT_SPECS``.

    Raises
    ------
    AssertionError
        If a table does not have the expected header key in a column (see
        ``getStat``); the assertion message is ``row``.
    """
    tables = getTablesForRow(row)
    game = allGames.iloc[row]

    # Basic Stats
    stats = {
        "Home": game["Home"],
        "Away": game["Away"],
        "Date": datetime.strptime(game["Date"], '%Y-%m-%d'),
    }

    for columnName, columnKey, statName, homeTable in _AGGREGATE_STAT_SPECS:
        stats.update(getStat(columnName, columnKey, statName, homeTable, row, tables))

    # Possession is intentionally not collected: reading it is very brittle.
    # stats["Home Posession"] = int(tables[2].iloc[1, 1].replace("%", ""))
    # stats["Away Posession"] = int(tables[2].iloc[1, 2].replace("%", ""))

    return stats


In [5]:
# Aggregate every fixture that has a usable home-team name, then persist one
# row per game to the "team-data" CSV.
statsList = [
    getAggregateStats(gameIndex)
    for gameIndex in tqdm(range(len(allGames)))
    if isValidRow(gameIndex)
]

df = pd.DataFrame(statsList)
df.to_csv("team-data", index=False)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2603/2603 [01:51<00:00, 23.44it/s]


In [6]:
df = pd.read_csv("team-data")

# Final score of every game; kept apart from the per-team frame built below.
scores = df[["Home", "Away", "Date", "Home Goals", "Away Goals"]].copy(deep=True)

# Split each game into a home-side and an away-side record that share the
# same "Team ..." column names, so both sides can be stacked into one frame.
home = (
    df[[col for col in df.columns if (col == "Date" or col.startswith("Home"))]]
    .copy(deep=True)
    .rename(columns=lambda name: name.replace("Home", "Team"))
)
away = (
    df[[col for col in df.columns if (col == "Date" or col.startswith("Away"))]]
    .copy(deep=True)
    .rename(columns=lambda name: name.replace("Away", "Team"))
)

# One row per (team, game), ordered chronologically within each team.
data = pd.concat([home, away]).sort_values(["Team", "Date"])
data["Date"] = data["Date"].astype("datetime64[ns]")

# Every per-team statistic column ("Team <stat>"); excludes "Team" and "Date".
statsList = [col for col in data.columns if col.startswith("Team ")]

In [7]:
# Compute rolling window stats (raw sums; no normalization is applied here)
def getWindow(windowSize, data, maxDays=90):
    """Sum each team's statistics over its previous ``windowSize`` games.

    For every row of ``data`` the earlier rows of the same team that fall
    strictly within ``maxDays`` days before that row's date are selected,
    and the last ``windowSize`` of them are summed column by column. Rows
    with fewer than ``windowSize`` such earlier games produce no output.

    Parameters
    ----------
    windowSize : int
        Number of previous games to aggregate.
    data : pandas.DataFrame
        One row per (team, game) with a "Team" column, a datetime "Date"
        column and statistic columns named "Team <stat>". The rows of each
        team are expected to be in chronological order, because the window
        is taken with ``tail``.
    maxDays : int, optional
        How far back, in days, a game may lie and still count. Defaults to
        90.

    Returns
    -------
    pandas.DataFrame
        Columns "Team", "Date" and one "<windowSize> Game Rolling Window
        <stat>" column per statistic. The columns are present even when no
        row qualifies.
    """
    print("Computing {}-game rolling window stats".format(windowSize))

    # Derive the statistic columns from the frame that was passed in rather
    # than from a notebook-level variable, so the function does not depend on
    # cell execution order.
    teamColumns = [column for column in data.columns if column.startswith("Team ")]
    windowPrefix = "{} Game Rolling Window".format(windowSize)
    windowColumns = [column.replace("Team", windowPrefix) for column in teamColumns]
    lookback = timedelta(days=maxDays)
    allDicts = list()

    for index in tqdm(range(len(data))):
        row = data.iloc[index]
        team = row["Team"]
        date = row["Date"]
        dateStart = date - lookback
        # Strict inequalities: the current game itself is never part of its
        # own window.
        before = data[(data["Team"] == team) & (data["Date"] < date) & (data["Date"] > dateStart)]
        beforeTail = before.tail(windowSize)
        if len(beforeTail) == windowSize:
            tmpDict = {"Team": team, "Date": date}
            for sourceColumn, targetColumn in zip(teamColumns, windowColumns):
                tmpDict[targetColumn] = beforeTail[sourceColumn].sum()
            allDicts.append(tmpDict)

    # Naming the columns explicitly keeps the frame well-formed when empty.
    return pd.DataFrame(allDicts, columns=["Team", "Date"] + windowColumns)


def windowToInputsOutputsCsv(data, scores, windowSize):
    """Write the model input and output CSV files for one window size.

    For every game in ``scores`` that has exactly one rolling-window row for
    the home team and one for the away team on the game's date, one input
    record (home value minus away value for each rolling-window statistic)
    and one output record (result label and goal difference) are produced.
    Games without both rows are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Rolling-window statistics as returned by ``getWindow``: "Team",
        "Date" and statistic columns whose names start with a digit.
    scores : pandas.DataFrame
        One row per game with "Date", "Home", "Away", "Home Goals" and
        "Away Goals".
    windowSize : int
        Window size; used in the progress message and in the output file
        names ``inputs-<windowSize>-window`` and
        ``outputs-<windowSize>-window``.

    Returns
    -------
    None
        The results are written to the two CSV files in the working
        directory.
    """
    print("Converting {}-game rolling window stats into input DataFrame".format(windowSize))
    tmpInputs = list()
    tmpOutputs = list()
    # Rolling-window columns are named "<windowSize> Game Rolling Window ...".
    # Accept any leading digit: matching only '1'..'5' silently dropped every
    # statistic for window sizes that start with 6, 7, 8 or 9.
    statsList = [column for column in data.columns if column[:1].isdigit()]

    for index in tqdm(range(len(scores))):
        game = scores.iloc[index]
        date = game["Date"]
        home = game["Home"]
        away = game["Away"]
        homeGoals = game["Home Goals"]
        awayGoals = game["Away Goals"]
        homeWin = homeGoals > awayGoals
        draw = homeGoals == awayGoals
        result = "Home Win" if homeWin else "Draw" if draw else "Home Loss"

        homeStats = data[(data["Date"] == date) & (data["Team"] == home)]
        awayStats = data[(data["Date"] == date) & (data["Team"] == away)]
        # Skip games where either side lacks a complete rolling window.
        if len(homeStats) != 1 or len(awayStats) != 1:
            continue

        homeRow = homeStats.iloc[0]
        awayRow = awayStats.iloc[0]

        tmp = dict()
        tmp["Date"] = date
        tmp["Home"] = home
        tmp["Away"] = away

        for stat in statsList:
            tmp["{} Difference".format(stat)] = homeRow[stat] - awayRow[stat]

        tmpInputs.append(tmp)
        tmpOutputs.append({"Date": date,
                           "Home": home,
                           "Away": away,
                           "Result": result,
                           "Goal Difference": homeGoals - awayGoals
                          })

    inputs = pd.DataFrame(tmpInputs)
    outputs = pd.DataFrame(tmpOutputs)
    inputs.to_csv("inputs-{}-window".format(windowSize), index=False)
    outputs.to_csv("outputs-{}-window".format(windowSize), index=False)


In [8]:
# Build the 5-game rolling-window features and write the input/output CSVs.
windowSize = 5
rollingStats = getWindow(windowSize, data)
windowToInputsOutputsCsv(rollingStats, scores, windowSize)

Computing 5-game rolling window stats


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4560/4560 [00:13<00:00, 342.67it/s]


Converting 5-game rolling window stats into input DataFrame


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2280/2280 [00:12<00:00, 182.97it/s]
