# In [None]:
# pip install pandas numpy statsbombpy matplotlib seaborn plotly networkx notebook

import pandas as pd
from statsbombpy import sb

# --------------------------------------------
# Configuration
# --------------------------------------------
# Destination folder and file names for the exported StatsBomb CSVs.
OUTPUT_DIR = r"E:\MSc Big Data Analytics in Football\WSL project\Statsbombpy data"
MATCH_FILE, EVENT_FILE = "wsl_matches_all.csv", "wsl_events_all.csv"

# Seasons to fetch, one (competition_id, season_id, season_name) row each.
# NOTE(review): the 2021/2022 row uses competition_id 1238 while every other
# season uses 37 — confirm against sb.competitions() that this id is the WSL.
WSL_SEASONS = [
    dict(zip(("competition_id", "season_id", "season_name"), row))
    for row in (
        (37, 4, "2018/2019"),
        (37, 42, "2019/2020"),
        (37, 90, "2020/2021"),
        (1238, 108, "2021/2022"),
    )
]

# --------------------------------------------
# Functions
# --------------------------------------------
def find_wsl_competitions():
    """List the StatsBomb competitions whose name contains "Super League".

    The filter is a case-insensitive substring test on ``competition_name``,
    so every competition with "Super League" in its name is returned, not
    only the WSL. The competition name is printed next to the ids so the
    rows can be told apart before they are copied into ``WSL_SEASONS``.

    Returns:
        The matching rows of ``sb.competitions()`` with all their columns.
    """
    comps = sb.competitions()
    # na=False treats a missing competition name as "no match"; without it
    # the mask contains NaN and boolean indexing raises.
    is_super_league = comps['competition_name'].str.contains(
        "Super League", case=False, na=False
    )
    wsl_comps = comps[is_super_league]
    print("WSL Competitions Found:")
    print(wsl_comps[['competition_id', 'season_id', 'competition_name', 'season_name']])
    return wsl_comps

def prepare_events(match_id, columns=None):
    """Fetch one match's events and keep a fixed set of columns.

    Args:
        match_id: StatsBomb match id passed to ``sb.events``.
        columns: Optional iterable of column names to keep. When omitted,
            the default analysis columns listed below are used. Passing
            e.g. ``'period'`` or ``'index'`` as well keeps the information
            needed to order events chronologically later on.

    Returns:
        A new DataFrame with the requested columns, in the requested order,
        plus a ``match_id`` column. A requested column that this match's
        event feed does not contain is present but filled with NaN.
    """
    default_columns = [
        'id', 'type', 'team', 'player', 'minute', 'second',
        'location', 'pass_end_location', 'shot_statsbomb_xg',
        'shot_outcome', 'carry_end_location', 'possession', 'possession_team'
    ]
    wanted = default_columns if columns is None else list(columns)

    df = sb.events(match_id=match_id)
    # reindex instead of df[wanted]: a match without, say, any carries has no
    # 'carry_end_location' column, and plain selection would raise KeyError
    # and abort the whole download. reindex returns a new frame, so the
    # assignment below never touches the frame returned by sb.events.
    df_clean = df.reindex(columns=wanted)
    df_clean['match_id'] = match_id
    return df_clean

def collect_season_data():
    """Download matches and per-match events for every entry of WSL_SEASONS.

    Progress is printed per season. Each DataFrame is tagged with a
    ``season_name`` column holding the season's label.

    Returns:
        A pair of lists: one matches DataFrame per season, and one events
        DataFrame per match (in season order, then match order).
    """
    season_matches, match_events = [], []

    for entry in WSL_SEASONS:
        comp_id = entry["competition_id"]
        season_id = entry["season_id"]
        label = entry["season_name"]
        print(f"\nFetching WSL {label} (competition_id={comp_id}, season_id={season_id}) ...")

        fixtures = sb.matches(competition_id=comp_id, season_id=season_id)
        print(f" → {fixtures.shape[0]} matches loaded")
        fixtures['season_name'] = label
        season_matches.append(fixtures)

        # One events frame per match, tagged with the same season label.
        for match_id in fixtures.match_id:
            events = prepare_events(match_id)
            events['season_name'] = label
            match_events.append(events)

    return season_matches, match_events

# --------------------------------------------
# Main Execution
# --------------------------------------------
if __name__ == "__main__":
    # The output paths below are built with os.path, which this cell does not
    # import at the top; import it here so the save step cannot fail with a
    # NameError after the whole download has already run.
    import os

    # Step 1: Find WSL competitions
    find_wsl_competitions()

    # Step 2: Collect match and event data
    all_matches, all_events = collect_season_data()

    # Step 3: Concatenate data
    matches_df = pd.concat(all_matches, ignore_index=True)
    events_df = pd.concat(all_events, ignore_index=True)
    print(f"\n✅ Total Matches Collected: {matches_df.shape[0]}")
    print(f"✅ Total Events Collected: {events_df.shape[0]}")

    # Step 4: Save to CSV (create the target folder first so to_csv does not
    # fail on a missing directory)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    matches_df.to_csv(os.path.join(OUTPUT_DIR, MATCH_FILE), index=False)
    events_df.to_csv(os.path.join(OUTPUT_DIR, EVENT_FILE), index=False)
    print(f"\n💾 Saved: '{MATCH_FILE}' and '{EVENT_FILE}'")

# In [None]:
# pip install openpyxl

import pandas as pd
import os

# --------------------------------------------
# Configuration
# --------------------------------------------
# Root folder of the FbRef exports; consolidated output is written to a
# "processed" subfolder, which is created when this cell runs.
BASE_DIR = r"E:\MSc Big Data Analytics in Football\WSL project\FbRef"
OUTPUT_DIR = os.path.join(BASE_DIR, "processed")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Season folder name -> season label, "18-19" -> "2018/2019" up to "21-22".
SEASONS = {
    f"{start}-{start + 1}": f"20{start}/20{start + 1}"
    for start in range(18, 22)
}

# Workbook names (without extension): the same six stat categories exist
# once per level, prefixed with "Team" or "Player".
_STAT_CATEGORIES = ("Stats", "GK", "Shooting", "Passing", "Possession", "Def Actions")
TEAM_FILES = [f"Team {category}" for category in _STAT_CATEGORIES]
PLAYER_FILES = [f"Player {category}" for category in _STAT_CATEGORIES]

# --------------------------------------------
# Functions
# --------------------------------------------
def load_fbref_data(season_folder, season_label, file_list, level="team"):
    """Read one season's FbRef Excel workbooks and stack them into one frame.

    For every name in ``file_list`` the workbook
    ``BASE_DIR/<season_folder>/<name>.xlsx`` is read if it exists; a missing
    workbook is reported on stdout and skipped. Each loaded frame gets three
    tag columns: ``season`` (``season_label``), ``stat_category`` (the
    workbook name) and ``level``.

    Returns:
        The row-wise concatenation of all loaded frames with a fresh index,
        or an empty DataFrame when none of the workbooks exist.
    """
    loaded = []
    for category in file_list:
        workbook = os.path.join(BASE_DIR, season_folder, f"{category}.xlsx")
        if not os.path.exists(workbook):
            print(f"⚠️ Missing file: {workbook}")
            continue

        frame = pd.read_excel(workbook)
        frame["season"] = season_label
        frame["stat_category"] = category
        frame["level"] = level
        loaded.append(frame)

    if not loaded:
        return pd.DataFrame()
    return pd.concat(loaded, ignore_index=True)

# --------------------------------------------
# Main Execution
# --------------------------------------------
if __name__ == "__main__":
    # Step 1: for each season, load the team workbooks and then the player
    # workbooks, keeping one combined frame per season and level.
    team_dfs, player_dfs = [], []
    for folder, label in SEASONS.items():
        print(f"📂 Processing {folder} ({label}) ...")
        for bucket, files, level in (
            (team_dfs, TEAM_FILES, "team"),
            (player_dfs, PLAYER_FILES, "player"),
        ):
            bucket.append(load_fbref_data(folder, label, files, level=level))

    # Step 2: stack the per-season frames of each level.
    team_all = pd.concat(team_dfs, ignore_index=True)
    player_all = pd.concat(player_dfs, ignore_index=True)

    # Step 3: write both consolidated tables and report their shapes.
    for frame, filename in (
        (team_all, "fbref_team_all.csv"),
        (player_all, "fbref_player_all.csv"),
    ):
        frame.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)
    print(
        "✅ FbRef consolidation complete!",
        f"Team dataset: {team_all.shape}",
        f"Player dataset: {player_all.shape}",
        sep="\n",
    )

# In [None]:
import pandas as pd
import os

# --------------------------------------------
# Configuration
# --------------------------------------------
# Folder holding the StatsBomb exports; the two input CSVs and the labelled
# output CSV all live directly inside it.
BASE_DIR = r"E:\MSc Big Data Analytics in Football\WSL project\Statsbombpy data"
MATCH_FILE, EVENT_FILE, OUTPUT_FILE = (
    os.path.join(BASE_DIR, name)
    for name in (
        "wsl_matches_all.csv",
        "wsl_events_all.csv",
        "wsl_events_with_gamestate.csv",
    )
)

# --------------------------------------------
# Functions
# --------------------------------------------
def load_data():
    """Read the match and event CSVs and print the shape of each.

    Returns:
        ``(matches_df, events_df)`` as read from ``MATCH_FILE`` and
        ``EVENT_FILE`` respectively.
    """
    matches, events = (pd.read_csv(path) for path in (MATCH_FILE, EVENT_FILE))
    for caption, frame in (("Matches:", matches), ("Events:", events)):
        print(caption, frame.shape)
    return matches, events

def build_match_info(matches_df):
    """Map each match id to its home and away team.

    Returns:
        A dict ``{match_id: {"home": home_team, "away": away_team}}`` built
        from the ``match_id``, ``home_team`` and ``away_team`` columns. If a
        match id occurs more than once, the last row wins.
    """
    return {
        record["match_id"]: {
            "home": record["home_team"],
            "away": record["away_team"],
        }
        for record in matches_df.to_dict("records")
    }

def _relative_state(goals_for, goals_against):
    """Label a score line from the point of view of the team with ``goals_for``."""
    if goals_for > goals_against:
        return "winning"
    if goals_for < goals_against:
        return "losing"
    return "drawing"


def add_game_state(events_df, match_info):
    """Label every event with the acting team's game state at that moment.

    Events are ordered per match and a running score is kept. The score
    changes on a ``Shot`` whose ``shot_outcome`` is ``"Goal"`` and on an
    ``Own Goal For`` event (own goals are not recorded as shots, so they
    would otherwise never reach the score). The event that changes the score
    is itself labelled with the score *after* the goal.

    Args:
        events_df: Events with at least ``match_id``, ``minute``, ``second``,
            ``team`` and ``type`` columns. ``shot_outcome`` is used when
            present. ``period`` and ``index`` are used for ordering when
            present (see the comment on the sort keys below).
        match_info: ``{match_id: {"home": team, "away": team}}``.

    Returns:
        A sorted copy of ``events_df`` with a fresh index and a new
        ``game_state`` column holding ``"winning"``, ``"losing"``,
        ``"drawing"``, or ``None`` when the event's team is neither the home
        nor the away side of its match. The input frame is not modified.

    Raises:
        KeyError: If an event's ``match_id`` is missing from ``match_info``.
    """
    # The minute counter restarts at the nominal kick-off minute of each
    # period, so first-half stoppage time overlaps the start of the second
    # half when sorting by minute/second alone. Order by period first when it
    # is available, and break ties inside one second with the event index.
    sort_keys = ["match_id"]
    if "period" in events_df.columns:
        sort_keys.append("period")
    sort_keys += ["minute", "second"]
    if "index" in events_df.columns:
        sort_keys.append("index")
    events_df = events_df.sort_values(by=sort_keys).reset_index(drop=True)

    if "shot_outcome" in events_df.columns:
        outcomes = events_df["shot_outcome"]
    else:
        outcomes = [None] * len(events_df)

    game_states = []
    current_match = None
    home = away = None
    score_home = score_away = 0

    # Iterate the needed columns directly instead of building a Series per
    # row with iterrows.
    rows = zip(events_df["match_id"], events_df["team"], events_df["type"], outcomes)
    for match_id, team, event_type, outcome in rows:
        # New match: reset the score and look the two teams up once.
        if match_id != current_match:
            current_match = match_id
            teams = match_info[match_id]
            home, away = teams["home"], teams["away"]
            score_home = score_away = 0

        # "Own Goal For" is attributed to the team that is awarded the goal.
        scored = (
            (event_type == "Shot" and outcome == "Goal")
            or event_type == "Own Goal For"
        )
        if scored:
            if team == home:
                score_home += 1
            elif team == away:
                score_away += 1

        if team == home:
            game_state = _relative_state(score_home, score_away)
        elif team == away:
            game_state = _relative_state(score_away, score_home)
        else:
            game_state = None
        game_states.append(game_state)

    events_df["game_state"] = game_states
    return events_df

# --------------------------------------------
# Main Execution
# --------------------------------------------
if __name__ == "__main__":
    # Steps 1-3: read the CSVs, build the match -> teams lookup, then label
    # every event with its team's game state.
    matches_df, events_df = load_data()
    match_info = build_match_info(matches_df)
    events_df = add_game_state(events_df, match_info)

    # Step 4: write the labelled events and print a short summary.
    events_df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
    print(
        "✅ Game state labeling complete!",
        f"Saved file: {OUTPUT_FILE}",
        f"Shape: {events_df.shape}",
        events_df.head(),
        sep="\n",
    )