In [2]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import json
import pandas as pd
import logging

# Root-logger setup: emit INFO and above as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Year component of the league URLs. The displayed output for this value shows
# matches dated 2023-08 through 2024-05, i.e. it selects the season labelled
# "23/24" elsewhere in this notebook.
UNDERSTAT_SEASON_YEAR = 2023
UNDERSTAT_LEAGUE_URL = "https://understat.com/league/{slug}/{year}"

# Display name (later used as the "League" column value) -> slug in the URL path.
_UNDERSTAT_LEAGUE_SLUGS = {
    "EPL": "EPL",
    "La Liga": "La_liga",
    "Bundesliga": "Bundesliga",
    "Serie A": "Serie_A",
    "Ligue One": "Ligue_1",
}

# League display name -> league page URL, built from the single season constant
# above so the year cannot drift between leagues.
understat_leagues = {
    name: UNDERSTAT_LEAGUE_URL.format(slug=slug, year=UNDERSTAT_SEASON_YEAR)
    for name, slug in _UNDERSTAT_LEAGUE_SLUGS.items()
}


def fetch_and_parse_html(url: str, timeout: int = 10) -> BeautifulSoup | None:
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to download.
        timeout: Value passed to ``requests.get`` as its ``timeout``.

    Returns:
        The parsed document, or ``None`` when the request fails (connection
        error, timeout, or an HTTP 4xx/5xx status). Failures are logged.
    """
    # Browser-like User-Agent; presumably the site treats the default
    # python-requests agent differently -- TODO confirm it is still required.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into HTTPError (a RequestException subclass)
        # so an error page is never parsed and handed back as if it were data.
        response.raise_for_status()
    except RequestException as e:
        logging.error(f"Error fetching URL {url}: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def extract_json(script_content: str) -> dict:
    """Decode the JSON payload embedded in a ``JSON.parse('...')`` call.

    The text between the first ``JSON.parse('`` and the next ``')`` is taken
    as a backslash-escaped string (e.g. ``\\x7B`` for ``{``), unescaped, and
    parsed as JSON.

    Args:
        script_content: Text of a ``<script>`` element. ``None`` or any other
            non-string value is treated as "nothing to extract".

    Returns:
        The parsed JSON value. Despite the annotation this is whatever the
        payload's top-level value is (a list for array payloads). An empty
        dict is returned, and the reason logged, when the marker or its
        terminator is missing or the payload cannot be decoded.
    """
    marker = "JSON.parse('"
    if not isinstance(script_content, str):
        logging.error("Error extracting JSON: script content is not a string")
        return {}

    marker_pos = script_content.find(marker)
    if marker_pos == -1:
        # str.find signals "not found" with -1; without this check the slice
        # below would silently start at an arbitrary offset.
        logging.error("Error extracting JSON: no JSON.parse(' marker found")
        return {}

    start = marker_pos + len(marker)
    end = script_content.find("')", start)
    if end == -1:
        logging.error("Error extracting JSON: unterminated JSON.parse(' payload")
        return {}

    try:
        # unicode_escape reads raw bytes as Latin-1, so encode with Latin-1
        # (escaping anything outside it) rather than UTF-8; a UTF-8 round trip
        # turns literal non-ASCII characters into mojibake.
        unescaped = (
            script_content[start:end]
            .encode("latin-1", "backslashreplace")
            .decode("unicode_escape")
        )
        return json.loads(unescaped)
    except (ValueError, json.JSONDecodeError) as e:
        # UnicodeDecodeError from a malformed escape is a ValueError as well.
        logging.error(f"Error extracting JSON: {e}")
        return {}


def get_match_by_match_club_data(url: str) -> pd.DataFrame:
    """Build a one-row-per-match frame from the team payload of a league page.

    The page at ``url`` is fetched, the JSON embedded in its third
    ``<script>`` element is decoded, and every entry of each team's
    ``"history"`` list becomes one row carrying that team's ``id`` and
    ``title`` alongside the entry's own fields (an entry's own ``id`` or
    ``title`` key, if present, takes precedence).

    Args:
        url: League page URL.

    Returns:
        The flattened frame, or an empty ``DataFrame`` when the page cannot be
        fetched or does not have the expected structure (the error is logged).
    """
    soup = fetch_and_parse_html(url)
    if soup is None:
        return pd.DataFrame()
    try:
        # Position-based lookup: presumably the third script tag holds the
        # teams payload -- this breaks if the page layout changes.
        script = soup.find_all("script")[2].string
        json_data = extract_json(script)
        rows = [
            {"id": team.get("id"), "title": team.get("title"), **match}
            for team in json_data.values()
            for match in team.get("history", [])
        ]
        return pd.DataFrame(rows)
    except (IndexError, ValueError, KeyError, AttributeError, TypeError) as e:
        # AttributeError/TypeError cover a script tag whose .string is None
        # and payloads that are not a dict of team dicts.
        logging.error(f"Error processing team data: {e}")
        return pd.DataFrame()


def get_season_playerdata_(url: str) -> pd.DataFrame:
    """Build a frame from the JSON embedded in a page's fourth ``<script>``.

    The page at ``url`` is fetched, the JSON payload of its fourth
    ``<script>`` element is decoded, and the decoded value is handed to
    ``pd.DataFrame`` unchanged.

    Args:
        url: Page URL.

    Returns:
        The resulting frame, or an empty ``DataFrame`` when the page cannot be
        fetched or does not have the expected structure (the error is logged).
    """
    soup = fetch_and_parse_html(url)
    if soup is None:
        return pd.DataFrame()
    try:
        # Position-based lookup: presumably the fourth script tag holds the
        # players payload -- this breaks if the page layout changes.
        script = soup.find_all("script")[3].string
        json_data = extract_json(script)
        return pd.DataFrame(json_data)
    except (IndexError, ValueError, KeyError, AttributeError, TypeError) as e:
        # AttributeError/TypeError cover a script tag whose .string is None.
        logging.error(f"Error processing player data: {e}")
        return pd.DataFrame()


def get_all_leagues_match_data(leagues: dict, season: str = "23/24") -> pd.DataFrame:
    """Fetch match-by-match club data for several leagues into one frame.

    Args:
        leagues: Mapping of league display name -> league page URL.
        season: Label written to the ``Season`` column of every row. It is
            only a label: the pages fetched are determined by the URLs in
            ``leagues``, not by this value.

    Returns:
        All leagues' rows concatenated with a fresh integer index, plus
        ``League`` and ``Season`` columns. Leagues that raise or yield no rows
        are logged and skipped. When nothing could be collected, an empty
        frame that still has the ``League`` and ``Season`` columns is
        returned, so filtering on them keeps working.
    """
    frames = []
    for league, url in leagues.items():
        try:
            club_df = get_match_by_match_club_data(url=url)
            if club_df.empty:
                # The fetch helper reports failure as an empty frame; name the
                # league here instead of silently concatenating nothing.
                logging.warning(
                    f"No match data returned for league {league} ({url}); skipping"
                )
                continue
            frames.append(club_df.assign(League=league, Season=season))
        except Exception as e:
            # Deliberately broad: one broken league must not abort the rest.
            logging.error(f"Failed to fetch or process data for league {league}: {e}")

    if not frames:
        return pd.DataFrame(columns=["League", "Season"])
    return pd.concat(frames, ignore_index=True)

In [3]:
# Collect every configured league; the season label is spelled out (it equals the default).
all_cubs = get_all_leagues_match_data(understat_leagues, season="23/24")

In [4]:
# Keep only rows labelled with the "Ligue One" key of understat_leagues.
french_data = all_cubs.loc[all_cubs["League"] == "Ligue One"]

In [5]:
# Bare last expression: the notebook renders this frame as the cell's output.
french_data

Unnamed: 0,id,title,h_a,xG,xGA,npxG,npxGA,ppda,ppda_allowed,deep,...,xpts,result,date,wins,draws,loses,pts,npxGD,League,Season
2892,160,Lille,a,1.534370,1.121410,1.534370,1.121410,"{'att': 289, 'def': 48}","{'att': 347, 'def': 18}",6,...,1.7044,d,2023-08-12 18:00:00,0,1,0,1,0.412960,Ligue One,23/24
2893,160,Lille,h,3.239000,1.271640,3.239000,1.271640,"{'att': 178, 'def': 17}","{'att': 227, 'def': 16}",16,...,2.6098,w,2023-08-20 11:00:00,1,0,0,3,1.967360,Ligue One,23/24
2894,160,Lille,a,1.451270,1.745000,1.451270,1.745000,"{'att': 190, 'def': 29}","{'att': 269, 'def': 27}",13,...,1.1322,l,2023-08-27 15:05:00,0,0,1,0,-0.293730,Ligue One,23/24
2895,160,Lille,h,1.339060,0.793407,1.339060,0.793407,"{'att': 186, 'def': 21}","{'att': 314, 'def': 24}",8,...,1.7839,w,2023-09-03 13:00:00,1,0,0,3,0.545653,Ligue One,23/24
2896,160,Lille,a,1.609030,0.427673,1.609030,0.427673,"{'att': 328, 'def': 37}","{'att': 386, 'def': 22}",6,...,2.4129,d,2023-09-16 15:00:00,0,1,0,1,1.181357,Ligue One,23/24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3499,282,Le Havre,h,0.649411,1.501620,0.649411,1.501620,"{'att': 141, 'def': 13}","{'att': 229, 'def': 21}",5,...,0.6372,l,2024-04-21 13:00:00,0,0,1,0,-0.852209,Ligue One,23/24
3500,282,Le Havre,a,1.212680,2.258290,0.452581,2.258290,"{'att': 314, 'def': 13}","{'att': 98, 'def': 23}",2,...,0.5427,d,2024-04-27 19:00:00,0,1,0,1,-1.805709,Ligue One,23/24
3501,282,Le Havre,h,2.563060,0.631836,2.563060,0.631836,"{'att': 336, 'def': 14}","{'att': 177, 'def': 15}",4,...,2.7516,w,2024-05-04 13:00:00,1,0,0,3,1.931224,Ligue One,23/24
3502,282,Le Havre,a,0.427788,1.936350,0.427788,1.936350,"{'att': 371, 'def': 16}","{'att': 228, 'def': 21}",4,...,0.3180,l,2024-05-10 19:00:00,0,0,1,0,-1.508562,Ligue One,23/24


In [7]:
# Write the combined all-league frame to disk without the positional row index.
# NOTE(review): ppda / ppda_allowed appear to hold dicts (see the displayed frame),
# so they land in the CSV as their Python repr strings -- confirm that is intended.
all_cubs.to_csv(path_or_buf="Understat_23_24_Season_Data.csv", index=False)