In [1]:
pip install pandas numpy statsbombpy matplotlib seaborn plotly networkx notebook openpyxl tqdm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
import logging
import warnings
from statsbombpy import sb
from tqdm import tqdm
import time

In [9]:
# --------------------------------------------
# Configuration
# --------------------------------------------
# Base directories
STATSBASE_DIR = r"E:\MSc Big Data Analytics in Football\WSL project\Statsbombpy data"
FBREF_BASE_DIR = r"E:\MSc Big Data Analytics in Football\WSL project\Data\FbRef"

# FbRef output directory. This name was referenced below without ever being
# defined, so the cell always failed with a NameError. The "processed"
# sub-folder is the location the preview cell later reads FbRef CSVs from.
FBREF_OUT_DIR = os.path.join(FBREF_BASE_DIR, "processed")

# StatsBomb output files
STATS_OUT_DIR = STATSBASE_DIR
MATCH_FILE = os.path.join(STATS_OUT_DIR, "wsl_matches_all.csv")
EVENT_FILE = os.path.join(STATS_OUT_DIR, "wsl_events_all.csv")
GAMESTATE_FILE = os.path.join(STATS_OUT_DIR, "wsl_events_with_gamestate.csv")

# Logging configuration
LOG_FILE = os.path.join(STATS_OUT_DIR, "data_processing.log")

# Test mode: Set to False to process all seasons
TEST_MODE = False

# Create output directories upfront to prevent FileNotFoundError.
# This must happen BEFORE os.chdir(): changing into a directory that does not
# exist yet would fail on a first run.
try:
    os.makedirs(STATS_OUT_DIR, exist_ok=True)
    os.makedirs(FBREF_OUT_DIR, exist_ok=True)
except Exception as e:
    print(f"Error creating directories: {e}")
    # Re-raise so the cell stops with a traceback; exit() is not a dependable
    # way to abort execution inside a notebook kernel.
    raise

# Set working directory for Jupyter Notebook
try:
    os.chdir(STATS_OUT_DIR)
except Exception as e:
    print(f"Error setting working directory to {STATS_OUT_DIR}: {e}")
    raise

In [10]:
# --------------------------------------------
# Logging Setup
# --------------------------------------------
# Preferred setup: mirror every record to the log file and to the notebook
# output. Constructing the FileHandler is the step that can fail (for example
# when LOG_FILE cannot be opened), in which case we drop to console-only.
try:
    logging.basicConfig(
        handlers=[
            logging.FileHandler(LOG_FILE, encoding='utf-8'),
            logging.StreamHandler(),
        ],
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
except Exception as exc:
    print(f"Error setting up logging: {exc}. Falling back to console-only logging.")
    logging.basicConfig(
        handlers=[logging.StreamHandler()],
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )

# Module-level logger used by every function in this notebook.
logger = logging.getLogger(__name__)

In [11]:
# --------------------------------------------
# StatsBomb Data Collection
# --------------------------------------------
def find_wsl_competitions():
    """Return the StatsBomb competition rows that belong to the WSL.

    Calls ``sb.competitions()`` and keeps the rows whose ``competition_name``
    contains "Women's Super League" (case-insensitive). The matching
    competition/season ids are written to the log.

    Returns:
        pandas.DataFrame: the matching rows, or an empty DataFrame when
        nothing matches or when the lookup raises.
    """
    try:
        competitions = sb.competitions()
        is_wsl = competitions['competition_name'].str.contains("Women's Super League", case=False)
        wsl_comps = competitions[is_wsl]

        if not wsl_comps.empty:
            logger.info("WSL Competitions Found:")
            logger.info("\n" + wsl_comps[['competition_id', 'season_id', 'season_name']].to_string())
            return wsl_comps

        logger.warning("No WSL competitions found.")
        return pd.DataFrame()
    except Exception as exc:
        # Any failure (network, unexpected schema) is logged and reported to
        # the caller as "nothing found".
        logger.error(f"Error fetching competitions: {exc}")
        return pd.DataFrame()

def prepare_events(match_id, max_attempts=3, retry_delay=1):
    """Fetch the events of one match and keep only the analysis columns.

    Besides the columns the notebook already used, ``period`` and ``index``
    are retained. Without them events cannot be put back into chronological
    order: ``minute`` keeps counting through first-half stoppage time, so
    minute/second alone interleave the end of the first half with the start
    of the second.

    Args:
        match_id: StatsBomb match identifier passed to ``sb.events``.
        max_attempts: total number of tries before giving up (default 3).
        retry_delay: seconds to wait between tries (default 1).

    Returns:
        pandas.DataFrame: the selected columns plus a ``match_id`` column, or
        an empty DataFrame when every attempt failed.
    """
    cols = [
        'id', 'type', 'team', 'player', 'minute', 'second',
        'location', 'pass_end_location', 'shot_statsbomb_xg',
        'shot_outcome', 'carry_end_location', 'possession', 'possession_team',
        # Ordering columns, appended last so existing column positions stay put.
        'period', 'index',
    ]

    for attempt in range(1, max_attempts + 1):
        try:
            df = sb.events(match_id=match_id)

            # Optional columns (e.g. shot_* in a match without shots) may be
            # absent; keep whatever subset is available and say so.
            available_cols = [col for col in cols if col in df.columns]
            missing_cols = [col for col in cols if col not in df.columns]
            if missing_cols:
                logger.warning(f"Match {match_id}: Missing columns {missing_cols}")

            df_clean = df[available_cols].copy()
            df_clean['match_id'] = match_id
            return df_clean
        except Exception as e:
            logger.warning(f"Attempt {attempt} failed for match {match_id}: {e}")
            if attempt < max_attempts:
                time.sleep(retry_delay)  # Back off briefly before retrying
            else:
                logger.error(
                    f"Failed to fetch events for match {match_id} after {max_attempts} attempts: {e}"
                )

    # Reached when all attempts failed (or max_attempts < 1).
    return pd.DataFrame()

def collect_season_data():
    """Download matches and events for every WSL season StatsBomb exposes.

    Seasons come from ``find_wsl_competitions()``; when ``TEST_MODE`` is on,
    only ``season_id == 4`` is kept. Each season's match table and each
    match's event table is tagged with a ``season_name`` column.

    Returns:
        tuple[list, list]: (match DataFrames, one per season; event
        DataFrames, one per match). Both lists are empty when no competition
        is found. A season that raises is logged and skipped.
    """
    competitions = find_wsl_competitions()
    if competitions.empty:
        logger.error("No WSL competitions to process.")
        return [], []

    if TEST_MODE:
        # Test with 2018/2019 only
        competitions = competitions[competitions['season_id'] == 4]
        logger.info("TEST MODE: Processing only 2018/2019 season.")

    match_frames, event_frames = [], []

    for _, season_row in competitions.iterrows():
        cid = season_row["competition_id"]
        sid = season_row["season_id"]
        sname = season_row["season_name"]
        logger.info(f"Fetching WSL {sname} (competition_id={cid}, season_id={sid}) ...")
        try:
            season_matches = sb.matches(competition_id=cid, season_id=sid)
            if season_matches.empty:
                logger.warning(f"No matches found for season {sname}")
                continue

            logger.info(f" -> {season_matches.shape[0]} matches loaded")
            season_matches['season_name'] = sname
            match_frames.append(season_matches)

            progress = tqdm(season_matches.match_id, desc=f"Processing events for {sname}", leave=False)
            for mid in progress:
                match_events = prepare_events(mid)
                if match_events.empty:
                    # Fetch failed for this match; it has already been logged.
                    continue
                match_events['season_name'] = sname
                event_frames.append(match_events)
        except Exception as exc:
            logger.error(f"Error processing season {sname}: {exc}")

    return match_frames, event_frames

In [13]:
# --------------------------------------------
# Game State Addition
# --------------------------------------------
def load_data():
    """Read the saved match and event CSVs and check their schema.

    Reads ``MATCH_FILE`` and ``EVENT_FILE``, logs both shapes, and verifies
    that the columns needed for game-state labelling are present.

    Returns:
        tuple[pandas.DataFrame, pandas.DataFrame]: ``(matches_df, events_df)``
        on success; two empty DataFrames when a file cannot be read or a
        required column is missing.
    """
    try:
        matches_df = pd.read_csv(MATCH_FILE)
        events_df = pd.read_csv(EVENT_FILE)
        logger.info(f"Matches: {matches_df.shape}")
        logger.info(f"Events: {events_df.shape}")

        # Validate required columns
        missing_match_cols = [
            name for name in ("match_id", "home_team", "away_team")
            if name not in matches_df.columns
        ]
        missing_event_cols = [
            name for name in ("match_id", "team", "type", "minute", "second", "shot_outcome")
            if name not in events_df.columns
        ]

        if not (missing_match_cols or missing_event_cols):
            return matches_df, events_df

        logger.error(f"Missing columns in matches: {missing_match_cols}, events: {missing_event_cols}")
        return pd.DataFrame(), pd.DataFrame()
    except Exception as exc:
        logger.error(f"Error loading data: {exc}")
        return pd.DataFrame(), pd.DataFrame()

def build_match_info(matches_df):
    """Map each ``match_id`` to its home and away team.

    Args:
        matches_df: frame with ``match_id``, ``home_team`` and ``away_team``
            columns.

    Returns:
        dict: ``{match_id: {"home": <home_team>, "away": <away_team>}}``, or
        an empty dict if reading a row fails (the error is logged).
    """
    try:
        return {
            match_row["match_id"]: {
                "home": match_row["home_team"],
                "away": match_row["away_team"],
            }
            for _, match_row in matches_df.iterrows()
        }
    except Exception as exc:
        logger.error(f"Error building match info: {exc}")
        return {}

def _counts_as_goal_for_event_team(event_type, shot_outcome):
    """Return True when the event adds a goal to the team recorded on it."""
    # StatsBomb logs an own goal as a pair of events: "Own Goal Against" on
    # the conceding team and "Own Goal For" on the team that is awarded the
    # goal. Counting only "Own Goal For" credits each own goal exactly once.
    return (event_type == "Shot" and shot_outcome == "Goal") or event_type == "Own Goal For"


def _relative_state(own_score, opponent_score):
    """Return 'winning', 'losing' or 'drawing' from one team's perspective."""
    if own_score > opponent_score:
        return "winning"
    if own_score < opponent_score:
        return "losing"
    return "drawing"


def add_game_state(events_df, match_info):
    """Label every event with the game state of the team that performed it.

    Events are put in chronological order per match and a running score is
    kept. Each event gets ``"winning"``, ``"losing"`` or ``"drawing"`` from
    the perspective of its ``team``. It gets ``None`` when the match is not
    in ``match_info`` or the team is neither the home nor the away side.
    A scoring event itself is labelled with the score *after* the goal.

    Fixes over the previous version:
      * Own goals are now counted. The old own-goal branch was unreachable
        and read a ``shot_type`` column that is never collected.
      * When a ``period`` column is present it is used for ordering, so
        first-half stoppage time no longer interleaves with the start of the
        second half. An ``index`` column, if present, breaks ties within the
        same second.

    Args:
        events_df: events with at least ``match_id``, ``team``, ``type``,
            ``minute`` and ``second``; ``shot_outcome``, ``period`` and
            ``index`` are used when available.
        match_info: ``{match_id: {"home": ..., "away": ...}}``.

    Returns:
        pandas.DataFrame: the sorted events with a new ``game_state`` column,
        or an empty DataFrame on empty input or on error (logged).
    """
    if events_df.empty or not match_info:
        logger.error("Empty events DataFrame or match info.")
        return pd.DataFrame()

    try:
        sort_cols = ["match_id"]
        if "period" in events_df.columns:
            sort_cols.append("period")
        sort_cols += ["minute", "second"]
        if "index" in events_df.columns:
            sort_cols.append("index")
        events_df = events_df.sort_values(by=sort_cols).reset_index(drop=True)

        # shot_outcome is optional here, mirroring the previous ev.get() access.
        if "shot_outcome" in events_df.columns:
            shot_outcomes = events_df["shot_outcome"]
        else:
            shot_outcomes = [None] * len(events_df)

        game_states = []
        score_home, score_away = 0, 0
        current_match = None

        # Iterating the columns directly avoids building a Series per row,
        # which is what made iterrows() slow on the full event table.
        for match_id, team, event_type, shot_outcome in zip(
            events_df["match_id"], events_df["team"], events_df["type"], shot_outcomes
        ):
            # Reset scores for new match
            if match_id != current_match:
                current_match = match_id
                score_home, score_away = 0, 0

            teams = match_info.get(match_id)
            if teams is None:
                game_states.append(None)
                continue
            home, away = teams["home"], teams["away"]

            # Update the running score before labelling the event.
            if _counts_as_goal_for_event_team(event_type, shot_outcome):
                if team == home:
                    score_home += 1
                elif team == away:
                    score_away += 1

            # Assign game state from the acting team's point of view.
            if team == home:
                game_states.append(_relative_state(score_home, score_away))
            elif team == away:
                game_states.append(_relative_state(score_away, score_home))
            else:
                game_states.append(None)

        events_df["game_state"] = game_states
        return events_df
    except Exception as e:
        logger.error(f"Error adding game state: {e}")
        return pd.DataFrame()

In [14]:
# --------------------------------------------
# Main Execution
# --------------------------------------------
if __name__ == "__main__":
    # Step 1: StatsBomb Data Collection
    logger.info("Starting StatsBomb data collection...")
    all_matches, all_events = collect_season_data()
    matches_df = pd.concat(all_matches, ignore_index=True) if all_matches else pd.DataFrame()
    events_df = pd.concat(all_events, ignore_index=True) if all_events else pd.DataFrame()

    if not matches_df.empty:
        logger.info(f"Total Matches Collected: {matches_df.shape[0]}")
        if 'competition_id' in matches_df.columns:
            logger.info(f"Competition IDs: {matches_df['competition_id'].unique()}")
    else:
        logger.warning("No match data collected.")
    if not events_df.empty:
        logger.info(f"Total Events Collected: {events_df.shape[0]}")
    else:
        logger.warning("No event data collected.")

    # Only overwrite the CSVs when the download actually produced data.
    # Writing an empty frame would destroy a previously collected dataset
    # whenever a run fails (e.g. no network).
    if matches_df.empty or events_df.empty:
        logger.warning(
            f"Collection incomplete; leaving '{MATCH_FILE}' and '{EVENT_FILE}' untouched."
        )
    else:
        try:
            matches_df.to_csv(MATCH_FILE, index=False)
            events_df.to_csv(EVENT_FILE, index=False)
            logger.info(f"Saved: '{MATCH_FILE}' and '{EVENT_FILE}'")
        except Exception as e:
            logger.error(f"Error saving StatsBomb files: {e}")

    # Step 2: Game State Addition
    # Each failure below raises SystemExit rather than calling exit(1):
    # raising stops the cell in a notebook as well as in a plain script.
    logger.info("Starting game state addition...")
    matches_df, events_df = load_data()
    if matches_df.empty or events_df.empty:
        logger.error("Failed to load data. Exiting.")
        raise SystemExit(1)

    match_info = build_match_info(matches_df)
    if not match_info:
        logger.error("Failed to build match info. Exiting.")
        raise SystemExit(1)

    events_df = add_game_state(events_df, match_info)
    if events_df.empty:
        logger.error("Failed to add game state. Exiting.")
        raise SystemExit(1)

    try:
        events_df.to_csv(GAMESTATE_FILE, index=False, encoding="utf-8")
        logger.info(f"Game state labeling complete! Saved file: {GAMESTATE_FILE}")
        logger.info(f"Shape: {events_df.shape}")
        logger.info("\n" + events_df.head().to_string())
    except Exception as e:
        logger.error(f"Error saving game state file: {e}")

2025-08-27 05:16:55,029 - INFO - Starting StatsBomb data collection...
2025-08-27 05:16:55,049 - INFO - WSL Competitions Found:
2025-08-27 05:16:55,053 - INFO - 
    competition_id  season_id season_name
25              37         90   2020/2021
26              37         42   2019/2020
27              37          4   2018/2019
2025-08-27 05:16:55,055 - INFO - Fetching WSL 2020/2021 (competition_id=37, season_id=90) ...
2025-08-27 05:16:55,077 - INFO -  -> 131 matches loaded








2025-08-27 05:19:05,543 - INFO - Fetching WSL 2019/2020 (competition_id=37, season_id=42) ...                          
2025-08-27 05:19:05,563 - INFO -  -> 87 matches loaded




2025-08-27 05:20:23,794 - INFO - Fetching WSL 2018/2019 (competition_id=37, season_id=4) ...                           
2025-08-27 05:20:23,817 - INFO -  -> 108 matches loaded






2025-08-27 05:21:50,930 - INFO - Total Matches Collected: 326                                                          
2025-08-27 05:21:50,934 - INFO - Total Events Collected: 1095921
2025-08-27 05:22:02,509 - INFO - Saved: 'E:\MSc Big Data Analytics in Football\WSL project\Statsbombpy data\wsl_matches_all.csv' and 'E:\MSc Big Data Analytics in Football\WSL project\Statsbombpy data\wsl_events_all.csv'
2025-08-27 05:22:02,511 - INFO - Starting FbRef data consolidation...
2025-08-27 05:22:02,521 - INFO - Processing 18-19 (2018/2019) ...
2025-08-27 05:22:04,268 - INFO - Processing 19-20 (2019/2020) ...
2025-08-27 05:22:05,931 - INFO - Processing 20-21 (2020/2021) ...
2025-08-27 05:22:08,198 - INFO - Processing 21-22 (2021/2022) ...
2025-08-27 05:22:10,113 - INFO - Team dataset: (307, 53)
2025-08-27 05:22:10,115 - INFO - Player dataset: (5511, 58)
2025-08-27 05:22:10,245 - INFO - FbRef consolidation complete!
2025-08-27 05:22:10,247 - INFO - Starting game state addition...
2025-08-27 05:2

In [15]:
# Folders that hold the generated CSV files
statsbomb_dir = r"E:\MSc Big Data Analytics in Football\WSL project\Statsbombpy data"
fbref_dir = r"E:\MSc Big Data Analytics in Football\WSL project\Data\FbRef\processed"


def _preview_csv(heading, folder, filename):
    """Print `heading`, load `folder/filename`, print its head and return it."""
    print(heading)
    frame = pd.read_csv(os.path.join(folder, filename))
    print(frame.head())
    return frame


# StatsBomb files
matches_df = _preview_csv("Head of wsl_matches_all.csv:", statsbomb_dir, "wsl_matches_all.csv")
events_df = _preview_csv("\nHead of wsl_events_all.csv:", statsbomb_dir, "wsl_events_all.csv")
gamestate_df = _preview_csv(
    "\nHead of wsl_events_with_gamestate.csv:", statsbomb_dir, "wsl_events_with_gamestate.csv"
)

# FbRef files
team_df = _preview_csv("\nHead of fbref_team_all.csv:", fbref_dir, "fbref_team_all.csv")
player_df = _preview_csv("\nHead of fbref_player_all.csv:", fbref_dir, "fbref_player_all.csv")

Head of wsl_matches_all.csv:
   match_id  match_date      kick_off                        competition  \
0   3775648  2021-02-28  15:00:00.000  England - FA Women's Super League   
1   3775609  2021-04-28  20:30:00.000  England - FA Women's Super League   
2   3775633  2021-02-06  13:30:00.000  England - FA Women's Super League   
3   3775570  2021-03-28  13:30:00.000  England - FA Women's Super League   
4   3775581  2021-03-28  15:30:00.000  England - FA Women's Super League   

      season                   home_team                away_team  home_score  \
0  2020/2021                 Aston Villa              Arsenal WFC           0   
1  2020/2021                 Arsenal WFC      West Ham United LFC           2   
2  2020/2021                 Aston Villa  Tottenham Hotspur Women           1   
3  2020/2021  Brighton & Hove Albion WFC              Everton LFC           0   
4  2020/2021                 Chelsea FCW              Aston Villa           2   

   away_score match_status 