In [1]:
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import json
import pandas as pd
import logging

# Root logger setup: timestamp, level name, then message, at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# League label -> Understat league page URL (every path ends in /2023).
understat_leagues = dict(
    (
        ("EPL", "https://understat.com/league/EPL/2023"),
        ("La Liga", "https://understat.com/league/La_liga/2023"),
        ("Bundesliga", "https://understat.com/league/Bundesliga/2023"),
        ("Serie A", "https://understat.com/league/Serie_A/2023"),
        ("Ligue One", "https://understat.com/league/Ligue_1/2023"),
    )
)


def fetch_and_parse_html(url: str, timeout: int = 10) -> BeautifulSoup | None:
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to download.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        The parsed document, or ``None`` when the request fails (connection
        problem, timeout, or a 4xx/5xx response). Failures are logged.
    """
    # Browser-like User-Agent header sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Without this check an error page (404, 503, ...) would be parsed and
        # handed back as if it were the requested document. HTTPError derives
        # from RequestException, so it is reported by the handler below.
        response.raise_for_status()
    except RequestException as e:
        logging.error(f"Error fetching URL {url}: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def extract_json(script_content: str) -> dict:
    """Decode the payload of the first ``JSON.parse('...')`` call in a script.

    The text between ``JSON.parse('`` and the next ``')`` has its backslash
    escape sequences resolved and is then parsed as JSON.

    Args:
        script_content: Text of a script element. ``None`` or any other
            non-string value is tolerated and treated as "no payload".

    Returns:
        Whatever ``json.loads`` produces for the payload (a list when the
        payload is a JSON array), or ``{}`` when no payload is found or it
        cannot be decoded. Every failure is logged.
    """
    marker = "JSON.parse('"

    if not isinstance(script_content, str):
        logging.error("Error extracting JSON: script content is not a string")
        return {}

    # str.find returns -1 when the text is absent; check it instead of
    # slicing from a meaningless offset.
    marker_pos = script_content.find(marker)
    if marker_pos == -1:
        logging.error("Error extracting JSON: no JSON.parse('...') call found")
        return {}

    start = marker_pos + len(marker)
    end = script_content.find("')", start)
    if end == -1:
        logging.error("Error extracting JSON: unterminated JSON.parse('...') call")
        return {}

    try:
        # unicode_escape reads bytes as latin-1, so encode the same way:
        # characters up to U+00FF round-trip unchanged and anything above is
        # written as a backslash escape that the decode step turns back into
        # the original character. A plain UTF-8 encode would garble them.
        unescaped = (
            script_content[start:end]
            .encode("latin-1", "backslashreplace")
            .decode("unicode_escape")
        )
        return json.loads(unescaped)
    except (ValueError, json.JSONDecodeError) as e:
        logging.error(f"Error extracting JSON: {e}")
        return {}


def get_match_by_match_club_data(url: str) -> pd.DataFrame:
    """Build a table with one row per club per match from a league page.

    The page at ``url`` is downloaded and the JSON embedded in its third
    ``<script>`` element is decoded. Each entry of a club's ``history`` list
    becomes one row carrying the club's ``id`` and ``title`` plus the entry's
    own keys (the entry's keys win on a name clash).

    Args:
        url: League page to scrape.

    Returns:
        The combined rows, or an empty DataFrame when the page cannot be
        fetched, the script is missing or empty, or the payload does not have
        the expected shape. Failures are logged.
    """
    soup = fetch_and_parse_html(url)
    if not soup:
        return pd.DataFrame()
    try:
        # NOTE(review): position-based lookup; assumes the club payload is
        # always the third <script> on the page. Confirm against the site.
        script = soup.find_all("script")[2].string
        if not script:
            # .string is None for a tag without a single text child
            # (for example an external script that only has a src attribute).
            logging.error(
                "Error processing team data: script element has no inline text"
            )
            return pd.DataFrame()
        json_data = extract_json(script)
        return pd.DataFrame(
            [
                {"id": v.get("id"), "title": v.get("title"), **match}
                for v in json_data.values()
                for match in v.get("history", [])
            ]
        )
    # AttributeError / TypeError cover a payload of the wrong shape, such as a
    # list instead of a mapping, or a "history" value that is not iterable.
    except (IndexError, ValueError, KeyError, AttributeError, TypeError) as e:
        logging.error(f"Error processing team data: {e}")
        return pd.DataFrame()


def get_season_playerdata_(url: str) -> pd.DataFrame:
    """Load the player payload embedded in a league page into a DataFrame.

    The page at ``url`` is downloaded, the JSON embedded in its fourth
    ``<script>`` element is decoded, and the result is passed straight to
    ``pd.DataFrame``.

    Args:
        url: League page to scrape.

    Returns:
        The player table, or an empty DataFrame when the page cannot be
        fetched, the script is missing or empty, or the payload cannot be
        turned into a frame. Failures are logged.
    """
    soup = fetch_and_parse_html(url)
    if not soup:
        return pd.DataFrame()
    try:
        # NOTE(review): position-based lookup; assumes the player payload is
        # always the fourth <script> on the page. Confirm against the site.
        script = soup.find_all("script")[3].string
        if not script:
            # .string is None for a tag without a single text child
            # (for example an external script that only has a src attribute).
            logging.error(
                "Error processing player data: script element has no inline text"
            )
            return pd.DataFrame()
        json_data = extract_json(script)
        return pd.DataFrame(json_data)
    except (IndexError, ValueError, KeyError, AttributeError, TypeError) as e:
        # The message names player data so failures here can be told apart
        # from the club scraper's "team data" errors.
        logging.error(f"Error processing player data: {e}")
        return pd.DataFrame()


def get_all_leagues_match_data(leagues: dict, season: str = "23/24") -> pd.DataFrame:
    """Collect match-by-match club data for several leagues into one frame.

    Args:
        leagues: Mapping of league label to the URL handed to
            ``get_match_by_match_club_data``.
        season: Label written into the ``Season`` column of every row.

    Returns:
        All per-league frames stacked with a fresh index, each tagged with
        ``League`` and ``Season`` columns. An empty DataFrame is returned
        when no league produced a frame. A league that raises is logged and
        left out.
    """
    frames = []
    for league, league_url in leagues.items():
        try:
            league_frame = get_match_by_match_club_data(url=league_url)
            league_frame["League"] = league
            league_frame["Season"] = season
        except Exception as e:
            logging.error(f"Failed to fetch or process data for league {league}: {e}")
        else:
            frames.append(league_frame)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [2]:
# Scrape every configured league into one combined match-level frame.
all_cubs = get_all_leagues_match_data(understat_leagues)

In [6]:
# Keep only the rows whose League label is "Ligue One".
french_data = all_cubs.loc[all_cubs["League"].eq("Ligue One")]

In [None]:
# Bare expression: the notebook renders the filtered frame as this cell's output.
french_data