In [1]:
import ast
import asyncio
from pathlib import Path

import aiohttp
import nest_asyncio
import pandas as pd
from understat import Understat

In [2]:
# Applied so that the asyncio.run(...) call at the bottom of this cell can be
# used from inside the notebook. NOTE(review): presumably needed because the
# Jupyter kernel already runs its own event loop — confirm on your setup.
nest_asyncio.apply()

async def get_understat_data(leagues=None, seasons=None) -> pd.DataFrame:
    """Download league match results from Understat into one DataFrame.

    Every (league, season) pair is requested one after another through
    ``Understat.get_league_results``. The returned records are loaded into a
    DataFrame and tagged with ``league`` and ``season`` columns. A pair whose
    request raises is reported on stdout and skipped, so a single failure
    does not abort the rest of the scrape.

    Args:
        leagues: Iterable of league names to request. Defaults to
            EPL, La_Liga, Bundesliga, Serie_A and Ligue_1.
        seasons: Iterable of season years to request. Defaults to
            2017 through 2024 inclusive.

    Returns:
        The rows of all successful requests concatenated into a single
        DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If not a single (league, season) pair could be fetched.
    """
    if leagues is None:
        leagues = ["EPL", "La_Liga", "Bundesliga", "Serie_A", "Ligue_1"]
    if seasons is None:
        seasons = range(2017, 2025)
    # Materialise once: seasons is walked again for every league, which would
    # silently yield nothing on the second pass if a generator was passed in.
    leagues = list(leagues)
    seasons = list(seasons)

    async with aiohttp.ClientSession() as session:
        understat = Understat(session)
        all_data = []

        for league in leagues:
            for season in seasons:
                try:
                    matches = await understat.get_league_results(league, season)
                    df = pd.DataFrame(matches)
                    df["league"] = league
                    df["season"] = season
                    all_data.append(df)
                    print(f"‚úÖ {league} {season}: {len(df)} matches")
                except Exception as e:
                    # Deliberately best-effort: report the failed pair and
                    # carry on with the remaining ones.
                    print(f"‚ö†Ô∏è {league} {season}: {e}")

        if not all_data:
            # pd.concat([]) would raise a generic "No objects to concatenate";
            # say what actually went wrong instead.
            raise ValueError(
                "No Understat data could be fetched for any requested "
                "league/season (see the warnings printed above)"
            )

        return pd.concat(all_data, ignore_index=True)

# Create the output folder up front: to_csv() does not create missing
# directories, and failing at that point would throw away the whole scrape.
raw_csv_path = Path("../data/raw/understat_xg_data.csv")
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)

# Run the scrape to completion, then persist it without the index column.
data = asyncio.run(get_understat_data())
data.to_csv(raw_csv_path, index=False)
print("üíæ Saved Understat data!")

‚úÖ EPL 2017: 380 matches
‚úÖ EPL 2018: 380 matches
‚úÖ EPL 2019: 380 matches
‚úÖ EPL 2020: 380 matches
‚úÖ EPL 2021: 380 matches
‚úÖ EPL 2022: 380 matches
‚úÖ EPL 2023: 380 matches
‚úÖ EPL 2024: 380 matches
‚úÖ La_Liga 2017: 380 matches
‚úÖ La_Liga 2018: 380 matches
‚úÖ La_Liga 2019: 380 matches
‚úÖ La_Liga 2020: 380 matches
‚úÖ La_Liga 2021: 380 matches
‚úÖ La_Liga 2022: 380 matches
‚úÖ La_Liga 2023: 380 matches
‚úÖ La_Liga 2024: 380 matches
‚úÖ Bundesliga 2017: 306 matches
‚úÖ Bundesliga 2018: 306 matches
‚úÖ Bundesliga 2019: 306 matches
‚úÖ Bundesliga 2020: 306 matches
‚úÖ Bundesliga 2021: 306 matches
‚úÖ Bundesliga 2022: 306 matches
‚úÖ Bundesliga 2023: 306 matches
‚úÖ Bundesliga 2024: 306 matches
‚úÖ Serie_A 2017: 380 matches
‚úÖ Serie_A 2018: 380 matches
‚úÖ Serie_A 2019: 380 matches
‚úÖ Serie_A 2020: 380 matches
‚úÖ Serie_A 2021: 380 matches
‚úÖ Serie_A 2022: 380 matches
‚úÖ Serie_A 2023: 380 matches
‚úÖ Serie_A 2024: 380 matches
‚úÖ Ligue_1 2017: 380 matches
‚úÖ Ligue_1 2018: 

In [5]:
# Reload the raw scrape from disk. The nested columns (h, a, goals, xG,
# forecast) come back as dict-shaped strings and are parsed in a later cell.
df = pd.read_csv(r"../data/raw/understat_xg_data.csv")

In [6]:
# Peek at the reloaded frame; h/a/goals/xG/forecast are still strings here.
df.head()

Unnamed: 0,id,isResult,h,a,goals,xG,datetime,forecast,league,season
0,7119,True,"{'id': '83', 'title': 'Arsenal', 'short_title'...","{'id': '75', 'title': 'Leicester', 'short_titl...","{'h': '4', 'a': '3'}","{'h': '2.54329', 'a': '1.46495'}",2017-08-11 19:45:00,"{'w': '0.628', 'd': '0.2154', 'l': '0.1566'}",EPL,2017
1,7120,True,"{'id': '90', 'title': 'Watford', 'short_title'...","{'id': '87', 'title': 'Liverpool', 'short_titl...","{'h': '3', 'a': '3'}","{'h': '2.17647', 'a': '2.61549'}",2017-08-12 12:30:00,"{'w': '0.2452', 'd': '0.287', 'l': '0.4678'}",EPL,2017
2,7121,True,"{'id': '76', 'title': 'West Bromwich Albion', ...","{'id': '73', 'title': 'Bournemouth', 'short_ti...","{'h': '1', 'a': '0'}","{'h': '1.18399', 'a': '0.378659'}",2017-08-12 15:00:00,"{'w': '0.5875', 'd': '0.3056', 'l': '0.1069'}",EPL,2017
3,7122,True,"{'id': '74', 'title': 'Southampton', 'short_ti...","{'id': '84', 'title': 'Swansea', 'short_title'...","{'h': '0', 'a': '0'}","{'h': '2.21748', 'a': '0.406196'}",2017-08-12 15:00:00,"{'w': '0.8262', 'd': '0.1377', 'l': '0.0361'}",EPL,2017
4,7123,True,"{'id': '80', 'title': 'Chelsea', 'short_title'...","{'id': '92', 'title': 'Burnley', 'short_title'...","{'h': '2', 'a': '3'}","{'h': '1.35651', 'a': '0.564237'}",2017-08-12 15:00:00,"{'w': '0.5899', 'd': '0.284', 'l': '0.1261'}",EPL,2017


In [7]:
# Convert stringified dicts into real dicts.
# Cells that already hold a dict are left untouched, so re-running this cell
# without re-reading the CSV no longer raises inside ast.literal_eval.
# (`ast` is imported in the notebook's top import cell.)
for col in ["h", "a", "goals", "xG", "forecast"]:
    df[col] = df[col].apply(
        lambda value: value if isinstance(value, dict) else ast.literal_eval(value)
    )

In [8]:
def _extract(column, key, cast=None):
    """Return df[column] with every dict replaced by its `key` entry.

    When `cast` is given (e.g. int or float) the entry is passed through it;
    otherwise the entry is returned as stored.
    """
    if cast is None:
        return df[column].apply(lambda record: record[key])
    return df[column].apply(lambda record: cast(record[key]))


# Team names live under "title" in the h (home) / a (away) dicts.
df["home_team"] = _extract("h", "title")
df["away_team"] = _extract("a", "title")

# Goals and xG are stored as strings keyed by "h" / "a"; make them numeric.
df["home_goals"] = _extract("goals", "h", int)
df["away_goals"] = _extract("goals", "a", int)

df["home_xg"] = _extract("xG", "h", float)
df["away_xg"] = _extract("xG", "a", float)

In [9]:
# Same preview after flattening: team names, goals and xG now have their own columns.
df.head()

Unnamed: 0,id,isResult,h,a,goals,xG,datetime,forecast,league,season,home_team,away_team,home_goals,away_goals,home_xg,away_xg
0,7119,True,"{'id': '83', 'title': 'Arsenal', 'short_title'...","{'id': '75', 'title': 'Leicester', 'short_titl...","{'h': '4', 'a': '3'}","{'h': '2.54329', 'a': '1.46495'}",2017-08-11 19:45:00,"{'w': '0.628', 'd': '0.2154', 'l': '0.1566'}",EPL,2017,Arsenal,Leicester,4,3,2.54329,1.46495
1,7120,True,"{'id': '90', 'title': 'Watford', 'short_title'...","{'id': '87', 'title': 'Liverpool', 'short_titl...","{'h': '3', 'a': '3'}","{'h': '2.17647', 'a': '2.61549'}",2017-08-12 12:30:00,"{'w': '0.2452', 'd': '0.287', 'l': '0.4678'}",EPL,2017,Watford,Liverpool,3,3,2.17647,2.61549
2,7121,True,"{'id': '76', 'title': 'West Bromwich Albion', ...","{'id': '73', 'title': 'Bournemouth', 'short_ti...","{'h': '1', 'a': '0'}","{'h': '1.18399', 'a': '0.378659'}",2017-08-12 15:00:00,"{'w': '0.5875', 'd': '0.3056', 'l': '0.1069'}",EPL,2017,West Bromwich Albion,Bournemouth,1,0,1.18399,0.378659
3,7122,True,"{'id': '74', 'title': 'Southampton', 'short_ti...","{'id': '84', 'title': 'Swansea', 'short_title'...","{'h': '0', 'a': '0'}","{'h': '2.21748', 'a': '0.406196'}",2017-08-12 15:00:00,"{'w': '0.8262', 'd': '0.1377', 'l': '0.0361'}",EPL,2017,Southampton,Swansea,0,0,2.21748,0.406196
4,7123,True,"{'id': '80', 'title': 'Chelsea', 'short_title'...","{'id': '92', 'title': 'Burnley', 'short_title'...","{'h': '2', 'a': '3'}","{'h': '1.35651', 'a': '0.564237'}",2017-08-12 15:00:00,"{'w': '0.5899', 'd': '0.284', 'l': '0.1261'}",EPL,2017,Chelsea,Burnley,2,3,1.35651,0.564237


In [10]:
# Keep only the calendar date of each kickoff; the time of day is dropped.
kickoff = pd.to_datetime(df["datetime"])
df["date"] = kickoff.dt.date

# Store the season label as text rather than as a number.
df["season"] = df["season"].astype(str)

# Columns of the slim per-match table, in output order.
clean_columns = [
    "date",
    "home_team",
    "away_team",
    "home_goals",
    "away_goals",
    "home_xg",
    "away_xg",
    "league",
    "season",
]
df_clean = df[clean_columns]

In [12]:
# Preview the slim per-match table built from df.
df_clean.head()

Unnamed: 0,date,home_team,away_team,home_goals,away_goals,home_xg,away_xg,league,season
0,2017-08-11,Arsenal,Leicester,4,3,2.54329,1.46495,EPL,2017
1,2017-08-12,Watford,Liverpool,3,3,2.17647,2.61549,EPL,2017
2,2017-08-12,West Bromwich Albion,Bournemouth,1,0,1.18399,0.378659,EPL,2017
3,2017-08-12,Southampton,Swansea,0,0,2.21748,0.406196,EPL,2017
4,2017-08-12,Chelsea,Burnley,2,3,1.35651,0.564237,EPL,2017


In [13]:
# Persist the slim per-match table without the index column.
# NOTE(review): the cleaned file is written under data/raw, next to the raw
# scrape — confirm that is the intended location.
df_clean.to_csv("../data/raw/understat_xg_data_clean.csv", index=False)

In [14]:
# Read the cleaned file back from disk.
# NOTE(review): the str cast applied to `season` before saving is presumably
# not preserved by the CSV round-trip (read_csv infers column types); pass
# dtype={"season": str} here if later code expects strings — confirm.
clean_xg = pd.read_csv("../data/raw/understat_xg_data_clean.csv")

In [15]:
# Sanity check: the reloaded file should show the same first rows as df_clean.
clean_xg.head()

Unnamed: 0,date,home_team,away_team,home_goals,away_goals,home_xg,away_xg,league,season
0,2017-08-11,Arsenal,Leicester,4,3,2.54329,1.46495,EPL,2017
1,2017-08-12,Watford,Liverpool,3,3,2.17647,2.61549,EPL,2017
2,2017-08-12,West Bromwich Albion,Bournemouth,1,0,1.18399,0.378659,EPL,2017
3,2017-08-12,Southampton,Swansea,0,0,2.21748,0.406196,EPL,2017
4,2017-08-12,Chelsea,Burnley,2,3,1.35651,0.564237,EPL,2017
