# Data Scraping & Cleaning


## Imports

In [1]:
import json
import locale
import sys
from ast import literal_eval

import pandas as pd
import soccerdata as sd
from tqdm import tqdm

sys.path.append(
    r"C:\Users\Vitor\Desktop\Football Data Analytics\My_Projects\Analysis Tools"
)
import function_town as ft

## ⚠️ Important Disclaimer

To ensure the workflow below functions correctly, **a minor modification to the `whoscored.py` file from the `soccerdata` package is required**.  
Rather than forking or altering the original repository, I’ll describe the necessary changes here.

> **Full credit** goes to the original creator of the code, [Pieter Robberechts](https://x.com/p_robberechts).

### Required Changes

You need to update the following in your local `whoscored.py` file:
- The `COLS_EVENTS` dictionary  
- The `read_events()` function

The updated code for these elements is provided below.

In [None]:
# Output schema of `read_events(output_fmt="events")`: maps each column name
# to the default value used when that column is absent from the scraped data.
# The key order is also the column order of the returned dataframe.
COLS_EVENTS = {
    # The ID of the game
    "game_id": np.nan,
    # 'PreMatch', 'FirstHalf', 'SecondHalf', 'PostGame'
    "period": np.nan,
    # Integer indicating the minute of the event, ignoring stoppage time
    "minute": -1,
    # Integer indicating the second of the event, ignoring stoppage time
    "second": -1,
    # Integer indicating the minute of the event, taking into account stoppage time
    "expanded_minute": -1,
    # String describing the event type (e.g. 'Goal', 'Yellow Card', etc.)
    "type": np.nan,
    # String describing the event outcome ('Successful' or 'Unsuccessful')
    "outcome_type": np.nan,
    # The ID of the team that the event is associated with
    "team_id": np.nan,
    # The name of the team that the event is associated with
    "team": np.nan,
    # The ID of the player that the event is associated with
    "player_id": np.nan,
    # The name of the player that the event is associated with
    "player": np.nan,
    # Coordinates of the event's location
    "x": np.nan,
    "y": np.nan,
    "end_x": np.nan,
    "end_y": np.nan,
    # Coordinates of a shot's location
    "goal_mouth_y": np.nan,
    "goal_mouth_z": np.nan,
    # The coordinates where the ball was blocked
    "blocked_x": np.nan,
    "blocked_y": np.nan,
    # List of dicts with event qualifiers
    # NOTE(review): a list default cannot be broadcast by `df[col] = default`;
    # presumably this column is always present in the raw data - confirm.
    "qualifiers": [],
    # List of integers with satisfied Events Types
    "satisfied_events_types": np.nan, # added by this modification; not in the upstream soccerdata dictionary
    # Some boolean flags
    "is_touch": False,
    "is_shot": False,
    "is_goal": False,
    # 'Yellow', 'Red', 'SecondYellow'
    "card_type": np.nan,
    # The ID of an associated event
    "related_event_id": np.nan,
    # The ID of a secondary player that the event is associated with
    "related_player_id": np.nan,
}

In [None]:
def read_events(  # noqa: C901
    self,
    match_id: Optional[Union[int, list[int]]] = None,
    force_cache: bool = False,
    live: bool = False,
    output_fmt: Optional[str] = "events",
    retry_missing: bool = True,
    on_error: Literal["raise", "skip"] = "raise",
) -> Optional[Union[pd.DataFrame, dict[int, list], "OptaLoader"]]:  # type: ignore  # noqa: F821
    """Retrieve the event data for each game in the selected leagues and seasons.

    Compared to the upstream implementation, the 'events' output additionally
    carries the ``satisfiedEventsTypes`` list of every raw event (as a list of
    integers).

    Parameters
    ----------
    match_id : int or list of int, optional
        Retrieve the event stream for a specific game.
    force_cache : bool
        By default no cached data is used to scrape the list of available
        games for the current season. If True, will force the use of
        cached data anyway.
    live : bool
        If True, will not return a cached copy of the event data. This is
        useful to scrape live data.
    output_fmt : str, default: 'events'
        The output format of the returned data. Possible values are:
            - 'events' (default): Returns a dataframe with all events.
            - 'raw': Returns the original unformatted WhoScored JSON.
            - 'spadl': Returns a dataframe with the SPADL representation
            of the original events.
            See https://socceraction.readthedocs.io/en/latest/documentation/SPADL.html#spadl
            - 'atomic-spadl': Returns a dataframe with the Atomic-SPADL representation
            of the original events.
            See https://socceraction.readthedocs.io/en/latest/documentation/SPADL.html#atomic-spadl
            - 'loader': Returns a socceraction.data.opta.OptaLoader
            instance, which can be used to retrieve the actual data.
            See https://socceraction.readthedocs.io/en/latest/modules/generated/socceraction.data.opta.OptaLoader.html#socceraction.data.opta.OptaLoader
            - None: Doesn't return any data. This is useful to just cache
            the data without storing the events in memory.
    retry_missing : bool
        If no events were found for a game in a previous attempt, will
        retry to scrape the events. An empty cached file is always
        re-fetched, regardless of this flag.
    on_error : "raise" or "skip", default: "raise"
        Whether to raise an exception or to skip the game if a
        ConnectionError occurs.

    Raises
    ------
    ValueError
        If the given match_id could not be found in the selected seasons,
        or if a socceraction-based output format is combined with the
        'no_store' option.
    ConnectionError
        If the match page could not be retrieved.
    ImportError
        If the requested output format is 'spadl', 'atomic-spadl' or
        'loader' but the socceraction package is not installed, or if
        'loader' is requested with socceraction < 1.2.3.

    Returns
    -------
    See the description of the ``output_fmt`` parameter.
    """
    output_fmt = output_fmt.lower() if output_fmt is not None else None
    if output_fmt in ["loader", "spadl", "atomic-spadl"]:
        if self.no_store:
            raise ValueError(
                f"The '{output_fmt}' output format is not supported "
                "when using the 'no_store' option."
            )
        try:
            from socceraction.atomic.spadl import convert_to_atomic
            from socceraction.data.opta import OptaLoader
            from socceraction.data.opta.loader import _eventtypesdf
            from socceraction.data.opta.parsers import WhoScoredParser
            from socceraction.spadl.opta import convert_to_actions

            if output_fmt == "loader":
                import socceraction
                from packaging import version
        except ImportError as err:
            raise ImportError(
                "The socceraction package is required to use the 'spadl' "
                "or 'atomic-spadl' output format. "
                "Please install it with `pip install socceraction`."
            ) from err
        # The version check lives outside the try block: raised inside it, the
        # specific message below would be caught by `except ImportError` and
        # replaced by the generic "please install" message.
        if output_fmt == "loader" and version.parse(
            socceraction.__version__
        ) < version.parse("1.2.3"):
            raise ImportError("The 'loader' output format requires socceraction >= 1.2.3")

    urlmask = WHOSCORED_URL + "/Matches/{}/Live"
    filemask = "events/{}_{}/{}.json"
    # JavaScript variable on the match page that holds the match-centre JSON.
    js_var = "require.config.params['args'].matchCentreData"

    df_schedule = self.read_schedule(force_cache).reset_index()
    if match_id is not None:
        iterator = df_schedule[
            df_schedule.game_id.isin([match_id] if isinstance(match_id, int) else match_id)
        ]
        if len(iterator) == 0:
            raise ValueError("No games found with the given IDs in the selected seasons.")
    else:
        # Visit the games in random order.
        iterator = df_schedule.sample(frac=1)

    events = {}
    player_names = {}
    team_names = {}
    for i, (_, game) in enumerate(iterator.iterrows()):
        url = urlmask.format(game["game_id"])
        logger.info(
            "[%s/%s] Retrieving game with id=%s",
            i + 1,
            len(iterator),
            game["game_id"],
        )
        filepath = self.data_dir / filemask.format(
            game["league"], game["season"], game["game_id"]
        )

        try:
            reader = self.get(url, filepath, var=js_var, no_cache=live)
            reader_value = reader.read()
            # Grouping made explicit (same semantics as the original
            # `a and b or c`): a cached "null" is re-fetched only when
            # retry_missing is set, an empty file is always re-fetched.
            if (retry_missing and reader_value == b"null") or reader_value == b"":
                reader = self.get(url, filepath, var=js_var, no_cache=True)
        except ConnectionError as e:
            if on_error == "skip":
                logger.warning("Error while scraping game %s: %s", game["game_id"], e)
                continue
            raise

        # Rewind: the stream may have been consumed by the read() above.
        reader.seek(0)
        json_data = json.load(reader)
        if json_data is None:
            logger.warning("No events found for game %s", game["game_id"])
            continue

        # Collect the id -> name lookups used to label the final dataframe.
        player_names.update(
            {int(k): v for k, v in json_data["playerIdNameDictionary"].items()}
        )
        team_names.update(
            {
                int(json_data[side]["teamId"]): json_data[side]["name"]
                for side in ["home", "away"]
            }
        )
        if "events" not in json_data:
            continue

        game_events = json_data["events"]
        if output_fmt == "events":
            df_events = pd.DataFrame(game_events)
            # Add satisfiedEventsTypes from raw JSON events (empty list when
            # an event does not carry the field).
            df_events["satisfiedEventsTypes"] = [
                list(map(int, e.get("satisfiedEventsTypes", []))) for e in game_events
            ]
            df_events["game"] = game["game"]
            df_events["league"] = game["league"]
            df_events["season"] = game["season"]
            df_events["game_id"] = game["game_id"]
            events[game["game_id"]] = df_events
        elif output_fmt == "raw":
            events[game["game_id"]] = game_events
        elif output_fmt in ["spadl", "atomic-spadl"]:
            parser = WhoScoredParser(
                str(filepath),
                competition_id=game["league"],
                season_id=game["season"],
                game_id=game["game_id"],
            )
            df_events = (
                pd.DataFrame.from_dict(parser.extract_events(), orient="index")
                .merge(_eventtypesdf, on="type_id", how="left")
                .reset_index(drop=True)
            )
            df_actions = convert_to_actions(
                df_events, home_team_id=int(json_data["home"]["teamId"])
            )
            if output_fmt == "spadl":
                events[game["game_id"]] = df_actions
            else:
                events[game["game_id"]] = convert_to_atomic(df_actions)

    if output_fmt is None:
        return None

    if output_fmt == "raw":
        return events

    if output_fmt == "loader":
        return OptaLoader(
            root=self.data_dir,
            parser="whoscored",
            feeds={
                "whoscored": str(Path("events/{competition_id}_{season_id}/{game_id}.json"))
            },
        )

    if not events:
        # Kept as in the original implementation.
        return pd.DataFrame(index=["league", "season", "game"])

    df = (
        pd.concat(events.values())
        .pipe(standardize_colnames)
        .assign(
            player=lambda x: x.player_id.replace(player_names),
            team=lambda x: x.team_id.replace(team_names).replace(TEAMNAME_REPLACEMENTS),
        )
    )

    if output_fmt == "events":
        df = df.set_index(["league", "season", "game"]).sort_index()
        # Add missing columns with their defaults. Existing columns are never
        # overwritten, so no special case is needed for the satisfied-events
        # column (the former `col != "satisfiedEventsTypes"` test could never
        # be false because COLS_EVENTS uses snake_case keys).
        for col, default in COLS_EVENTS.items():
            if col not in df.columns:
                df[col] = default
        # These columns hold dicts in the raw data; keep only the display name.
        for col in ("outcome_type", "card_type", "type", "period"):
            df[col] = df[col].apply(
                lambda x: x.get("displayName") if pd.notnull(x) else x
            )
        df = df[list(COLS_EVENTS)]

    return df

The function below scrapes event data from WhoScored through the `soccerdata` package. It then cleans the resulting dataframe by transforming the `qualifiers` and `satisfied_events_types` columns.
Helper functions such as *transform_qualifiers* and *map_qualifier_codes* can be found in the Analysis Tools folder, in the "function_town.py" file.

In [2]:
def scrape_and_clean_whoscored_event_data(
    league_name: str,  # Choose from sd.WhoScored.available_leagues()
    season: int,  # Season = 2024 refers to the current 2024/2025 season
    storage_path: str = r"C:\Users\Vitor\Desktop\Football Data Analytics\My_Projects\Main level\data",  # Change path as needed
) -> pd.DataFrame:
    """
    Scrape event data from WhoScored for one league and season, clean it and
    store it in a Parquet file for future use.

    Parameters:
        league_name (str): Name of the league to scrape.
        season (int): Season year (e.g., 2024 for 2024/2025 season).
        storage_path (str): Directory in which the Parquet file is written.
            It is created if it does not exist yet.

    Returns:
        pd.DataFrame: Cleaned event data. The 'qualifiers' and
        'satisfied_events_types' columns hold JSON strings, exactly as they
        are written to the Parquet file.
    """
    from pathlib import Path  # local import: keeps this cell self-contained

    # Prepare the output location first, so that a bad path fails before the
    # (potentially very long) scrape rather than after it.
    output_dir = Path(storage_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    parquet_path = (
        output_dir / f"Event_data_{league_name.replace(' ', '_')}_{season}.parquet"
    )

    # Used with the soccerdata package to avoid issues with date formats.
    locale.setlocale(locale.LC_TIME, "en_US.UTF-8")

    ws = sd.WhoScored(
        leagues=league_name,
        seasons=season,
        no_cache=False,  # Change as needed. Ideally False: cached data is used when available, which speeds up the process and puts less strain on the platform's servers
        no_store=False,  # Change as needed. False so that newly downloaded data is stored
        headless=False,  # Change as needed. Just in case, to avoid looking too much like a bot
    )

    # NOTE(review): the locale is set again after building the scraper, as in
    # the original code - presumably in case initialisation alters it; confirm
    # whether this second call is still needed.
    locale.setlocale(locale.LC_TIME, "en_US.UTF-8")

    # force_cache=True: use the cached list of available games even for the
    # current season. retry_missing=True: retry games for which no events were
    # found in a previous attempt.
    events = ws.read_events(
        force_cache=True, retry_missing=True, output_fmt="events"
    ).reset_index()

    list_columns = ("qualifiers", "satisfied_events_types")

    # Values read back from a cache may be string representations of lists;
    # parse those and leave real lists untouched.
    for col in list_columns:
        events[col] = [
            literal_eval(x) if isinstance(x, str) else x for x in events[col]
        ]

    # ft.transform_qualifiers lives in function_town.py (not shown here). Per
    # the author's notes it simplifies each qualifier from
    #   {"type": {"displayName": "Length", "value": 212}, "value": "11.5"}
    # to a single-entry dict keyed by the qualifier name, with qualifiers that
    # have an empty value becoming True.
    events["qualifiers"] = events["qualifiers"].apply(ft.transform_qualifiers)

    # ft.map_qualifier_codes lives in function_town.py (not shown here). Per
    # the author's notes it returns a modified copy of the dataframe in which
    # the lists of integer codes in the given column are replaced by string
    # labels, warning when a row is not a list.
    events = ft.map_qualifier_codes(events, "satisfied_events_types")

    # Convert complex columns to JSON strings before saving
    for col in list_columns:
        events[col] = events[col].apply(json.dumps)

    # Save to Parquet
    events.to_parquet(parquet_path)

    return events

In [None]:
# Let's check the available leagues to scrape.
# The LC_TIME locale is set to US English first: the author does this whenever
# the soccerdata package is used, to avoid issues with date formats.
locale.setlocale(
    locale.LC_TIME, "en_US.UTF-8"
)  # must run before the soccerdata call below


sd.WhoScored.available_leagues()

The code above provides access to more leagues than the default configuration of the `soccerdata` package. This is possible because I customized the `league_dict.json` file, following the official documentation: [How to Add Custom Leagues](https://soccerdata.readthedocs.io/en/latest/howto/custom-leagues.html).

The modified `league_dict.json` file is available in the **Analysis Tools** folder. Additionally, you’ll find a customized version of the `teamname_replacements.json` file, which I adapted from the original to better handle inconsistencies in team naming across different data sources.

## Event data: Liga Portugal 24/25


In [7]:
# Scrape, clean and store the Liga Portugal events; season=2024 is the 2024/2025 season.
scrape_and_clean_whoscored_event_data(league_name="PRT-Liga Portugal", season=2024)





Processing: 100%|██████████| 7/7 [09:21<00:00, 80.23s/step]


Unnamed: 0,league,season,game,game_id,period,minute,second,expanded_minute,type,outcome_type,...,blocked_x,blocked_y,qualifiers,satisfied_events_types,is_touch,is_shot,is_goal,card_type,related_event_id,related_player_id
0,PRT-Liga Portugal,2425,2024-08-09 Sporting CP-Rio Ave,1836567,FirstHalf,0,0.0,0,Start,Successful,...,,,[],[],False,,,,,
1,PRT-Liga Portugal,2425,2024-08-09 Sporting CP-Rio Ave,1836567,FirstHalf,0,0.0,0,Start,Successful,...,,,[],[],False,,,,,
2,PRT-Liga Portugal,2425,2024-08-09 Sporting CP-Rio Ave,1836567,FirstHalf,0,0.0,0,Pass,Successful,...,,,"[{""Length"": ""11.5""}, {""Zone"": ""Back""}, {""Passe...","[""touches"", ""passAccurate"", ""shortPassAccurate...",True,,,,,
3,PRT-Liga Portugal,2425,2024-08-09 Sporting CP-Rio Ave,1836567,FirstHalf,0,3.0,0,Pass,Unsuccessful,...,,,"[{""Angle"": ""0.80""}, {""Length"": ""35.5""}, {""Long...","[""touches"", ""passInaccurate"", ""passChipped"", ""...",True,,,,,
4,PRT-Liga Portugal,2425,2024-08-09 Sporting CP-Rio Ave,1836567,FirstHalf,0,5.0,0,Pass,Unsuccessful,...,,,"[{""Angle"": ""6.01""}, {""Passendx"": ""45.7""}, {""He...","[""touches"", ""passBackZoneInaccurate"", ""passIna...",True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376371,PRT-Liga Portugal,2425,2025-04-12 Santa Clara-Sporting CP,1836751,SecondHalf,95,44.0,98,End,Successful,...,,,[],[],False,,,,,
376372,PRT-Liga Portugal,2425,2025-04-12 Santa Clara-Sporting CP,1836751,PostGame,0,0.0,8,End,Successful,...,,,[],[],False,,,,,
376373,PRT-Liga Portugal,2425,2025-04-12 Santa Clara-Sporting CP,1836751,PostGame,0,0.0,8,End,Successful,...,,,[],[],False,,,,,
376374,PRT-Liga Portugal,2425,2025-04-12 Santa Clara-Sporting CP,1836751,PreMatch,0,0.0,0,FormationSet,Successful,...,,,"[{""Captainplayerid"": ""337629""}, {""Playerpositi...",[],False,,,,,


## Event data: Liga Portugal 23/24


In [None]:
# Scrape, clean and store the Liga Portugal events; season=2023 is the 2023/2024 season.
scrape_and_clean_whoscored_event_data(league_name="PRT-Liga Portugal", season=2023)

## Event data: Liga Portugal 22/23


In [10]:
# Scrape, clean and store the Liga Portugal events; season=2022 is the 2022/2023 season.
scrape_and_clean_whoscored_event_data(league_name="PRT-Liga Portugal", season=2022)





Processing: 100%|██████████| 7/7 [55:01<00:00, 471.71s/step]


Unnamed: 0,league,season,game,game_id,period,minute,second,expanded_minute,type,outcome_type,...,blocked_x,blocked_y,qualifiers,satisfied_events_types,is_touch,is_shot,is_goal,card_type,related_event_id,related_player_id
0,PRT-Liga Portugal,2223,2022-08-05 Benfica-Arouca,1659177,FirstHalf,0,0.0,0,Start,Successful,...,,,[],[],False,,,,,
1,PRT-Liga Portugal,2223,2022-08-05 Benfica-Arouca,1659177,FirstHalf,0,0.0,0,Start,Successful,...,,,[],[],False,,,,,
2,PRT-Liga Portugal,2223,2022-08-05 Benfica-Arouca,1659177,FirstHalf,0,0.0,0,Pass,Successful,...,,,"[{""Standingsave"": true}, {""Length"": ""21.5""}, {...","[""touches"", ""passAccurate"", ""shortPassAccurate...",True,,,,,
3,PRT-Liga Portugal,2223,2022-08-05 Benfica-Arouca,1659177,FirstHalf,0,1.0,0,Pass,Successful,...,,,"[{""Passendy"": ""7.0""}, {""Length"": ""21.5""}, {""Pa...","[""touches"", ""passAccurate"", ""shortPassAccurate...",True,,,,,
4,PRT-Liga Portugal,2223,2022-08-05 Benfica-Arouca,1659177,FirstHalf,0,4.0,0,Pass,Successful,...,,,"[{""Standingsave"": true}, {""Passendy"": ""4.6""}, ...","[""touches"", ""passAccurate"", ""shortPassAccurate...",True,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467931,PRT-Liga Portugal,2223,2023-05-27 Portimonense-Arouca,1659469,SecondHalf,92,53.0,93,End,Successful,...,,,[],[],False,,,,,
467932,PRT-Liga Portugal,2223,2023-05-27 Portimonense-Arouca,1659469,PostGame,0,0.0,3,End,Successful,...,,,[],[],False,,,,,
467933,PRT-Liga Portugal,2223,2023-05-27 Portimonense-Arouca,1659469,PostGame,0,0.0,3,End,Successful,...,,,[],[],False,,,,,
467934,PRT-Liga Portugal,2223,2023-05-27 Portimonense-Arouca,1659469,PreMatch,0,0.0,0,FormationSet,Successful,...,,,"[{""Captainplayerid"": ""326486""}, {""Teamplayerfo...",[],False,,,,,


`To be Continued`