In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import random
import pandas as pd
import os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from io import StringIO
from pathlib import Path
import warnings

In [2]:
# Locate the chromedriver executable relative to the notebook's working directory:
# four directory levels up, then chromedriver2/chromedriver.
current_dir = Path.cwd()  
path_chrome = current_dir.parent.parent.parent.parent / "chromedriver2" / "chromedriver"

In [3]:
# Season identifier; it is embedded in the CSV folder name (csv<season_code>).
season_code = "25_26"

In [4]:
# Schedule ("Scores and Fixtures") pages, one per competition.
url_serie_a        = "https://fbref.com/en/comps/11/schedule/Serie-A-Scores-and-Fixtures"
url_ligue_1        = "https://fbref.com/en/comps/13/schedule/Ligue-1-Scores-and-Fixtures"
url_bundesliga     = "https://fbref.com/en/comps/20/schedule/Bundesliga-Scores-and-Fixtures"
url_premier_league = "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures"
url_la_liga        = "https://fbref.com/en/comps/12/schedule/La-Liga-Scores-and-Fixtures"
url_ldc            = "https://fbref.com/en/comps/8/schedule/Champions-League-Scores-and-Fixtures"
url_el             = "https://fbref.com/en/comps/19/schedule/Europa-League-Scores-and-Fixtures"
url_ecl            = "https://fbref.com/en/comps/882/schedule/Conference-League-Scores-and-Fixtures"

# The three lists below are parallel: position i of each one describes the same competition.
url_leagues     = [url_serie_a, url_ligue_1, url_bundesliga, url_premier_league, url_la_liga, url_ldc, url_el, url_ecl]
# HTML id of the schedule <table> to read on each page; "sched_all" makes url_games
# take its round-based (European competition) parsing branch.
url_id_tab_list = ["sched_2025-2026_11_1", "sched_2025-2026_13_1", "sched_2025-2026_20_1", "sched_2025-2026_9_1", "sched_2025-2026_12_1", "sched_all", "sched_all", "sched_all"]

# Display names; also used in the schedule CSV file names and in the "League" column.
leagues         = ["Italian Serie A", "French Ligue 1", "German Bundesliga", "English Premier League", "Spanish La Liga", "UEFA Champions League", "UEFA Europa League", "UEFA Europa Conference League"]

In [5]:
# Season CSV folders, both resolved relative to the notebook's working directory.
current_dir = Path.cwd() 
# Three levels up: holds the per-match raw files ("raw data/Matches") and the merged raw CSVs.
path_folder_bis = current_dir.parent.parent.parent / "csv" / f"csv{season_code}"
# Two levels up: holds the league schedules ("Leagues Games") and the merged team stats.
path_folder = current_dir.parent.parent / "csv" / f"csv{season_code}"

In [6]:
def new_gameweek(gameweek, count):
    """Translate a knockout round label into a sequential ``J<n>`` label.

    Each two-legged round is split into two labels according to how many
    knockout fixtures have been relabelled so far (``count``). Labels that
    match no rule are returned unchanged, together with an unchanged count.

    Parameters
    ----------
    gameweek : str
        Round label as read from the schedule table.
    count : int
        Running knockout-fixture counter.

    Returns
    -------
    tuple
        ``(gameweek, count)``; the count is incremented by one whenever the
        label was rewritten.
    """
    # (round label, highest count still mapped to this label, replacement label);
    # rules are tried in order and only the first match applies.
    leg_rules = (
        ('Knockout phase play-offs', 8, 'J9'),
        ('Knockout phase play-offs', 16, 'J10'),
        ('Round of 16', 24, 'J11'),
        ('Round of 16', 32, 'J12'),
        ('Quarter-finals', 36, 'J13'),
        ('Quarter-finals', 40, 'J14'),
        ('Semi-finals', 42, 'J15'),
        ('Semi-finals', 44, 'J16'),
    )

    for round_label, upper_bound, replacement in leg_rules:
        if gameweek == round_label and count <= upper_bound:
            gameweek = replacement
            count += 1
            break

    # The final is relabelled regardless of the counter value.
    if gameweek == 'Final':
        gameweek = 'J17'
        count += 1

    return gameweek, count

In [7]:
def data_game(url_game, game_week, home_team, away_team, league, path_chrome):
    """Scrape one match report page and return its player and goalkeeper tables.

    The page is loaded in headless Chrome, every ``<table>`` is parsed with a
    two-row header, and the tables are picked by position: 3-8 (home players),
    10-15 (away players), 9 (home goalkeeper) and 16 (away goalkeeper).

    Parameters
    ----------
    url_game : str
        URL of the match report page.
    game_week : str
        Game week label written to every returned row.
    home_team, away_team : str
        Team names written to the rows of the respective side.
    league : str
        League name written to every returned row.
    path_chrome : str or pathlib.Path
        Path to the chromedriver executable.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_players, data_goals)``, each stacking home rows over away rows,
        with "Game Week", "Team" and "League" columns inserted.

    Raises
    ------
    ValueError
        If the page exposes fewer than the 17 tables the positional layout
        relies on (the original code failed with a bare IndexError instead).
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/17.0"
    )
    service = Service(path_chrome)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url_game)
        time.sleep(random.uniform(5, 7))
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        html = driver.page_source
    finally:
        # Always release the browser, even when navigation or the script fails;
        # otherwise every failed match leaves a Chrome process behind.
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")

    dfs = [
        pd.read_html(StringIO(str(table)), header=[0, 1])[0]
        for table in soup.find_all("table")
    ]

    # The slicing below addresses tables up to index 16.
    if len(dfs) < 17:
        raise ValueError(
            f"Expected at least 17 tables on the match page, found {len(dfs)}: {url_game}"
        )

    # Side-by-side concatenation of the six per-player tables of each team.
    # NOTE(review): the dropped last row is presumably the team totals row — confirm.
    players_home = pd.concat(dfs[3:9], axis=1)[:-1]
    players_away = pd.concat(dfs[10:16], axis=1)[:-1]

    for frame, team in zip([players_home, players_away], [home_team, away_team]):
        # Tuple labels because the columns are still a two-level MultiIndex here.
        frame.insert(0, ("", "Game Week"), game_week)
        frame.insert(2, ("", "Team"), team)
        frame.insert(3, ("", "League"), league)

    data_players = pd.concat([players_home, players_away], axis=0)
    data_players.reset_index(drop=True, inplace=True)
    if isinstance(data_players.columns, pd.MultiIndex):
        # Keep only the second header row as column labels.
        data_players.columns = data_players.columns.droplevel(0)

    goals_home = dfs[9]
    goals_away = dfs[16]
    for frame, team in zip([goals_home, goals_away], [home_team, away_team]):
        frame.insert(0, ("", "Game Week"), game_week)
        frame.insert(2, ("", "Team"), team)
        frame.insert(3, ("", "League"), league)

    data_goals = pd.concat([goals_home, goals_away], axis=0)
    data_goals.reset_index(drop=True, inplace=True)
    if isinstance(data_goals.columns, pd.MultiIndex):
        data_goals.columns = data_goals.columns.droplevel(0)

    return data_players, data_goals

In [8]:
def stats_team(url, team1, team2, gameweek, league, path_chrome):
    """Scrape the team statistics box of a match report page.

    The page is loaded in headless Chrome and its third ``<table>`` is read as
    a two-column table (first column -> ``team1``, second -> ``team2``).

    Parameters
    ----------
    url : str
        URL of the match report page.
    team1, team2 : str
        Team names used as row labels of the result.
    gameweek : str
        Game week label written to both rows.
    league : str
        League name written to both rows.
    path_chrome : str or pathlib.Path
        Path to the chromedriver executable.

    Returns
    -------
    pandas.DataFrame
        One row per team (index named "Team"), with "League" and "Game Week"
        columns followed by one column per captured statistic.

    Raises
    ------
    ValueError
        If the page exposes fewer than three tables (the original code failed
        with a bare IndexError instead).
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/17.0"
    )
    service = Service(path_chrome)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        time.sleep(random.uniform(5, 7))
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        html = driver.page_source
    finally:
        # Always release the browser, even when navigation or the script fails.
        driver.quit()
    soup = BeautifulSoup(html, "html.parser")

    tables = soup.find_all("table")
    if len(tables) < 3:
        raise ValueError(
            f"Expected at least 3 tables on the match page, found {len(tables)}: {url}"
        )
    df = pd.read_html(StringIO(str(tables[2])))[0]

    stats = []
    team1_vals = []
    team2_vals = []
    current_stat = "Possession"

    for i in range(len(df)):
        home_cell = df.iloc[i, 0]
        away_cell = df.iloc[i, 1]

        # A row is kept as a value row only when BOTH cells contain '%' or '—';
        # any other row is treated as a statistic title. A cell that is NaN
        # stringifies to "nan" and therefore also counts as a title.
        is_title_row = (
            (home_cell is not None and ('%' not in str(home_cell) and '—' not in str(home_cell)))
            or (away_cell is not None and ('%' not in str(away_cell) and '—' not in str(away_cell)))
        )
        if is_title_row:
            current_stat = home_cell if home_cell is not None else away_cell
        else:
            stats.append(current_stat)
            team1_vals.append(home_cell)
            team2_vals.append(away_cell)

    df_clean = pd.DataFrame({
        team1: team1_vals,
        team2: team2_vals
    }, index=stats)

    # Transpose so that each team becomes a row and each statistic a column.
    df_clean = df_clean.T
    df_clean.insert(0, 'Game Week', [gameweek]*len(df_clean))
    df_clean.insert(0, 'League', [league]*len(df_clean))
    df_clean.index.name = 'Team'

    return df_clean

In [9]:
def url_games(url_league, url_id_table, path_chrome):
    """Scrape a competition schedule page and return one record per fixture.

    Parameters
    ----------
    url_league : str
        URL of the competition's schedule page.
    url_id_table : str
        HTML ``id`` of the schedule table. The value ``"sched_all"`` selects
        the round-based layout, where the game week is derived from the
        "round" cell (via ``new_gameweek``) and one word is trimmed from the
        end of the home name and from the start of the away name.
    path_chrome : str or pathlib.Path
        Path to the chromedriver executable.

    Returns
    -------
    list of dict
        Records with the keys "Game Week", "Home Team", "Away Team", "Score",
        "Attendance", "Venue", "Referee", "Date" and "URL". "URL" is the
        absolute match report link, or None when the row has none.

    Raises
    ------
    ValueError
        If no table with id ``url_id_table`` is present on the page (the
        original code failed with an AttributeError on None instead).
    """
    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                         "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15")

    service = Service(path_chrome)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url_league)
        time.sleep(random.uniform(6, 10))
        html = driver.page_source
    finally:
        # The HTML is already captured as a string, so the browser can be
        # released before parsing; this also covers failures during navigation.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id=url_id_table)
    if table is None:
        raise ValueError(f"Schedule table '{url_id_table}' not found at {url_league}")

    data_games_list = []
    count = 1  # running knockout-fixture counter consumed by new_gameweek
    is_round_based = url_id_table == "sched_all"

    for row in table.find_all("tr"):
        home_cell = row.find("td", {"data-stat": "home_team"})
        away_cell = row.find("td", {"data-stat": "away_team"})

        if not is_round_based:
            week_cell = row.find("th", {"data-stat": "gameweek"})
            gameweek = "J" + week_cell.text.strip() if week_cell is not None else None

            # Skip rows without team cells or with blank team names.
            if home_cell is None or away_cell is None:
                continue
            home = home_cell.text.strip()
            away = away_cell.text.strip()
            if not home or not away:
                continue
        else:
            round_cell = row.find("th", {"data-stat": "round"})
            if round_cell is None:
                # Row without a round cell: nothing to classify, skip it
                # instead of failing on a None dereference.
                continue
            round_name = round_cell.text.strip()
            if round_name == "League phase":
                week_cell = row.find("td", {"data-stat": "gameweek"})
                gameweek = "J" + week_cell.text.strip() if week_cell is not None else None
            else:
                gameweek = round_name

            # Drop the last word of the home name and the first word of the away name.
            # NOTE(review): presumably a country code attached to the club name — confirm.
            home = None
            if home_cell is not None:
                home_text = home_cell.text.strip()
                home_words = home_text.split()
                home = " ".join(home_words[:-1]) if len(home_words) > 1 else home_text
            away = None
            if away_cell is not None:
                away_text = away_cell.text.strip()
                away_words = away_text.split()
                away = " ".join(away_words[1:]) if len(away_words) > 1 else away_text

            # Called before the emptiness check on purpose: the counter also
            # advances for knockout rows that end up being skipped.
            gameweek, count = new_gameweek(gameweek, count)

            if not home or not away:
                continue

        date = row.find("td", {"data-stat": "date"})
        score = row.find("td", {"data-stat": "score"})
        attendance = row.find("td", {"data-stat": "attendance"})
        venue = row.find("td", {"data-stat": "venue"})
        referee = row.find("td", {"data-stat": "referee"})
        report = row.find("td", {"data-stat": "match_report"})

        link = None
        if report is not None:
            anchor = report.find("a")
            if anchor and anchor.has_attr("href"):
                link = "https://fbref.com" + anchor["href"]

        data_games_list.append({
            "Game Week": gameweek,
            "Home Team": home,
            "Away Team": away,
            "Score": score.text.strip() if score else None,
            "Attendance": attendance.text.strip() if attendance else None,
            "Venue": venue.text.strip() if venue else None,
            "Referee": referee.text.strip() if referee else None,
            "Date": date.text.strip() if date else None,
            "URL": link
        })

    return data_games_list

In [10]:
# Column labels assigned positionally to the merged per-player data (see
# players_column_names). The list starts with the four identification columns,
# and each later group begins again with Player / Shirt Number / Nationality /
# Position / Age / Minutes Played, so several labels appear more than once.
new_column_names_players = [
    "Game Week",  # Game week
    "Player",  # Player name
    "Team",  # Team
    "League",  # League
    "Shirt Number",  # Shirt number
    "Nationality",  # Nationality
    "Position",  # Position
    "Age",  # Age
    "Minutes",  # Minutes played
    "Goals",  # Goals
    "Assists",  # Assists
    "Penalty Kicks Made",  # Penalties scored
    "Penalty Kicks Attempted",  # Penalties attempted
    "Shots Total",  # Total shots (excluding penalties)
    "Shots on Target",  # Shots on target
    "Yellow Cards",  # Yellow cards
    "Red Cards",  # Red cards
    "Touches",  # Ball touches
    "Tackles",  # Successful tackles
    "Interceptions",  # Interceptions
    "Blocks",  # Blocks
    "Expected Goals (xG)",  # Expected goals (xG)
    "Non-Penalty Expected Goals (npxG)",  # xG excluding penalties
    "Expected Assisted Goals (xAG)",  # xA - expected assists
    "Shot-Creating Actions (SCA)",  # Actions leading to a shot
    "Goal-Creating Actions (GCA)",  # Actions leading to a goal
    "Passes Completed",  # Passes completed
    "Passes Attempted",  # Passes attempted
    "Pass Completion %",  # Pass completion %
    "Progressive Passes",  # Progressive passes
    "Carries",  # Ball carries
    "Progressive Carries",  # Progressive ball carries
    "Take-Ons Attempted",  # Dribbles attempted
    "Successful Take-Ons",  # Successful dribbles
    
    "Player",  # Player name
    "Shirt Number",  # Shirt number
    "Nationality",  # Nationality
    "Position",  # Position
    "Age",  # Age
    "Minutes Played",  # Minutes played
    "Passes Completed (Total)",  # Passes completed - total
    "Passes Attempted (Total)",  # Passes attempted - total
    "Pass Completion % (Total)",  # Completion % - total
    "Total Passing Distance",  # Total distance covered by passes
    "Progressive Passing Distance",  # Progressive distance of passes
    "Passes Completed (Short)",  # Passes completed - short
    "Passes Attempted (Short)",  # Passes attempted - short
    "Pass Completion % (Short)",  # Completion % - short
    "Passes Completed (Medium)",  # Passes completed - medium
    "Passes Attempted (Medium)",  # Passes attempted - medium
    "Pass Completion % (Medium)",  # Completion % - medium
    "Passes Completed (Long)",  # Passes completed - long
    "Passes Attempted (Long)",  # Passes attempted - long
    "Pass Completion % (Long)",  # Completion % - long
    "Assists",  # Assists
    "Expected Assisted Goals (xAG)",  # xA generated by passes
    "Expected Assists (xA)",  # Overall xA
    "Key Passes",  # Key passes (leading to a shot)
    "Passes into Final Third",  # Passes into the final third
    "Passes into Penalty Area",  # Passes into the penalty area
    "Crosses into Penalty Area",  # Crosses into the penalty area
    "Progressive Passes",  # Progressive passes
    
    "Player",  # Player name
    "Shirt Number",  # Shirt number
    "Nationality",  # Nationality
    "Position",  # Position
    "Age",  # Age
    "Minutes Played",  # Minutes played
    "Passes Attempted",  # Passes attempted
    "Live-ball Passes",  # Live-ball passes
    "Dead-ball Passes",  # Dead-ball passes
    "Passes from Free Kicks",  # Passes from free kicks
    "Through Balls",  # Through balls
    "Switches",  # Switches of play
    "Crosses",  # Crosses
    "Throw-ins Taken",  # Throw-ins taken
    "Corner Kicks",  # Total corners
    "Inswinging Corner Kicks",  # Inswinging corners
    "Outswinging Corner Kicks",  # Outswinging corners
    "Straight Corner Kicks",  # Straight corners
    "Passes Completed",  # Passes completed
    "Passes Offside",  # Passes resulting in an offside
    "Passes Blocked",  # Passes blocked
    
    "Player",  # Player name
    "Shirt Number",  # Shirt number
    "Nationality",  # Nationality
    "Position",  # Position
    "Age",  # Age
    "Minutes Played",  # Minutes played
    "Tackles",  # Total tackles
    "Tackles Won",  # Tackles won (possession regained)
    "Tackles in Defensive Third",  # Tackles in the defensive third
    "Tackles in Middle Third",  # Tackles in the middle third
    "Tackles in Attacking Third",  # Tackles in the attacking third
    "Dribblers Tackled",  # Dribblers tackled
    "Dribbles Challenged",  # Dribbles challenged (tackles + failed attempts)
    "Tackle Success % vs Dribblers",  # Success % against dribblers
    "Challenges Lost",  # Failed challenges on a dribbler
    "Blocks",  # Balls blocked
    "Shots Blocked",  # Shots blocked
    "Passes Blocked",  # Passes blocked
    "Interceptions",  # Interceptions
    "Tackles + Interceptions",  # Tackles + interceptions
    "Clearances",  # Clearances
    "Errors Leading to Shot",  # Errors leading to a shot
    
    "Player",  # Player name
    "Shirt Number",  # Shirt number
    "Nationality",  # Nationality
    "Position",  # Position
    "Age",  # Age
    "Minutes Played",  # Minutes played
    "Touches",  # Total touches
    "Touches in Defensive Penalty Area",  # In the defensive penalty area
    "Touches in Defensive Third",  # In the defensive third
    "Touches in Middle Third",  # In the middle third
    "Touches in Attacking Third",  # In the attacking third
    "Touches in Attacking Penalty Area",  # In the attacking penalty area
    "Live-Ball Touches",  # Live-ball touches
    "Take-Ons Attempted",  # Dribbles attempted
    "Take-Ons Succeeded",  # Successful dribbles
    "Take-On Success %",  # Dribble success %
    "Times Tackled During Take-On",  # Times tackled while dribbling
    "Tackled During Take-On %",  # % of dribbles stopped
    "Carries",  # Ball carries
    "Total Carrying Distance",  # Total distance covered with the ball
    "Progressive Carrying Distance",  # Progressive distance
    "Progressive Carries",  # Progressive carries
    "Carries into Final Third",  # Carries into the final third
    "Carries into Penalty Area",  # Carries into the penalty area
    "Miscontrols",  # Failed controls
    "Dispossessed",  # Balls lost (dispossessed)
    "Passes Received",  # Passes received
    "Progressive Passes Received",  # Progressive passes received
    
    "Player",  # Player name
    "Shirt Number",  # Shirt number
    "Nationality",  # Nationality
    "Position",  # Position
    "Age",  # Age
    "Minutes Played",  # Minutes played
    "Yellow Cards",  # Yellow cards
    "Red Cards",  # Red cards
    "Second Yellow Card",  # Second yellow
    "Fouls Committed",  # Fouls committed
    "Fouls Drawn",  # Fouls suffered
    "Offsides",  # Offsides
    "Crosses",  # Crosses
    "Interceptions",  # Interceptions
    "Tackles Won",  # Successful tackles
    "Penalty Kicks Won",  # Penalties won
    "Penalty Kicks Conceded",  # Penalties conceded
    "Own Goals",  # Own goals
    "Ball Recoveries",  # Balls recovered
    "Aerials Won",  # Aerial duels won
    "Aerials Lost",  # Aerial duels lost
    "Aerial Win %"  # % of aerial duels won
]


In [11]:
# Column labels assigned positionally to the merged goalkeeper data
# (see goals_column_names).
new_column_names_goals = [
    "Game Week",  # Game week
    "Player",  # Player name
    "Team",  # Team
    "League",  # League
    "Nationality",  # Nationality
    "Age",  # Age
    "Minutes",  # Minutes played
    "Shots on Target Against",  # Shots on target faced
    "Goals Against",  # Goals conceded
    "Saves",  # Saves
    "Save Percentage",  # Save percentage
    "Post-Shot Expected Goals (PSxG)",  # PSxG - post-shot xG (quality of the on-target shots faced)
    "Passes Completed (Launched)",  # Long passes completed (> 40 yards)
    "Passes Attempted (Launched)",  # Long passes attempted
    "Pass Completion % (Launched)",  # % of long passes completed
    "Passes Attempted (GK)",  # Passes attempted (excluding goal kicks)
    "Throws Attempted",  # Throws
    "Launch % (Passes)",  # % of long passes among passes excluding goal kicks
    "Average Pass Length",  # Average pass length
    "Goal Kicks Attempted",  # Goal kicks
    "Launch % (Goal Kicks)",  # % of long goal kicks
    "Average Goal Kick Length",  # Average goal kick length
    "Crosses Faced",  # Opposition crosses faced
    "Crosses Stopped",  # Crosses intercepted
    "Cross Stop %",  # % of crosses intercepted
    "Def. Actions Outside Penalty Area",  # Defensive actions outside the penalty area
    "Average Distance of Def. Actions"  # Average distance of those actions
]

In [12]:
def players_column_names(df, new_column_names_players):
    """Replace the column labels of ``df`` in place and return the same object.

    Labels are assigned by position, so ``new_column_names_players`` must have
    exactly one entry per existing column.
    """
    df.columns = new_column_names_players
    return df

In [13]:
def goals_column_names(df, new_column_names_goals):
    """Replace the column labels of ``df`` in place and return the same object.

    Labels are assigned by position, so ``new_column_names_goals`` must have
    exactly one entry per existing column.
    """
    df.columns = new_column_names_goals
    return df

In [14]:
# Number of placeholder fixtures ("Home" vs "Away", no score) appended to the
# schedule of each European competition.
# NOTE(review): presumably reserves match numbers for knockout ties that are
# not listed yet — confirm.
extra_matches = {
    "UEFA Champions League": 45,
    "UEFA Europa League": 45,
    "UEFA Europa Conference League": 45
}

# Fixed column layout of every schedule CSV; same keys and order as the records
# returned by url_games. Declaring it keeps the frame well-formed (and dropna
# usable) even when a schedule page yields no fixture rows.
game_columns = [
    "Game Week", "Home Team", "Away Team", "Score", "Attendance",
    "Venue", "Referee", "Date", "URL",
]

# Total number of fixtures over all competitions; later cells use it as the
# upper bound of the per-match file numbering.
no_matches = 0

# The three configuration lists are consumed in lockstep.
assert len(url_leagues) == len(url_id_tab_list) == len(leagues), \
    "url_leagues, url_id_tab_list and leagues must have the same length"

# to_csv does not create missing folders, so make sure the target exists
# before spending time on scraping.
games_dir = path_folder / "Leagues Games"
games_dir.mkdir(parents=True, exist_ok=True)

for url_league, url_id_table, league in zip(url_leagues, url_id_tab_list, leagues):
    print(f"In {league}:")
    data_games_list = url_games(url_league, url_id_table, path_chrome)
    data_games_df = pd.DataFrame(data_games_list, columns=game_columns)
    data_games_df = data_games_df.dropna(subset=["Home Team", "Away Team"])

    if league in extra_matches:
        n_extra = extra_matches[league]
        extra_rows = pd.DataFrame([{
            "Home Team": "Home",
            "Away Team": "Away"
        }] * n_extra)
        data_games_df = pd.concat([data_games_df, extra_rows], ignore_index=True)

    print(f"→ {len(data_games_df)} matches found.")
    no_matches += len(data_games_df)

    data_games_df.to_csv(games_dir / f"{league}_games.csv", index=False)

In Italian Serie A:
→ 380 matches found.
In French Ligue 1:
→ 306 matches found.
In German Bundesliga:
→ 306 matches found.
In English Premier League:
→ 380 matches found.
In Spanish La Liga:
→ 380 matches found.
In UEFA Champions League:
→ 189 matches found.
In UEFA Europa League:
→ 189 matches found.
In UEFA Europa Conference League:
→ 153 matches found.


In [15]:
data_players_list = []
data_goals_list = []
data_teams_list = []

# Inventory of the per-match raw files already on disk, one list per file kind.
player_files = sorted(
    name for name in os.listdir(path_folder_bis / "raw data/Matches/")
    if name.startswith("data_players_") and name.endswith(".csv")
)
goal_files = sorted(
    name for name in os.listdir(path_folder_bis / "raw data/Matches/")
    if name.startswith("data_goals_") and name.endswith(".csv")
)
team_files = sorted(
    name for name in os.listdir(path_folder_bis / "raw data/Matches/")
    if name.startswith("data_teams_") and name.endswith(".csv")
)

# Match number encoded in each file name (data_<kind>_<n>.csv).
player_nums = sorted(int(name.replace("data_players_", "").replace(".csv", "")) for name in player_files)
goal_nums = sorted(int(name.replace("data_goals_", "").replace(".csv", "")) for name in goal_files)
team_nums = sorted(int(name.replace("data_teams_", "").replace(".csv", "")) for name in team_files)

# Every match number that should exist: 1 .. no_matches inclusive.
full_range = set(range(1, no_matches+1))

missing_players = sorted(full_range.difference(player_nums))
missing_goals = sorted(full_range.difference(goal_nums))
missing_teams = sorted(full_range.difference(team_nums))

print("Missing players:", missing_players)
print("Missing goals  :", missing_goals)
print("Missing teams  :", missing_teams)

Missing players: [115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 31

In [16]:
warnings.filterwarnings("ignore")

# Match numbers for which at least one of the three raw files is missing.
# A set makes the per-match lookup O(1) instead of scanning three lists.
matches_to_scrape = set(missing_players) | set(missing_goals) | set(missing_teams)
matches_dir = path_folder_bis / "raw data" / "Matches"

# Running 1-based match number across all competitions, in schedule order;
# it is the number used in the per-match file names.
no_games = 1
for i in range(len(url_leagues)):
    url_league = url_leagues[i]
    url_id_table = url_id_tab_list[i]
    league = leagues[i]
    print(f"For {league}:")
    file_path = os.path.join(path_folder, f"Leagues Games/{league}_games.csv")
    data_games_df = pd.read_csv(file_path, header=0, index_col=False)
    data_games_list = data_games_df.to_dict(orient="records")
    print(f"→ {len(data_games_list)} matches found.")

    for data_games in data_games_list:
        print(no_games)
        # No score: the row is skipped, but it still consumes a match number.
        if pd.isna(data_games["Score"]) or data_games["Score"] in ("", None):
            no_games += 1
            continue

        game_week = data_games["Game Week"]
        home_team = data_games["Home Team"]
        away_team = data_games["Away Team"]
        url_game = data_games["URL"]

        if no_games in matches_to_scrape:
            if pd.isna(url_game) or url_game == "":
                # A score without a match report link: do not start a browser
                # on an invalid URL, just report the match as failed.
                print(f"⚠️ Failed for {no_games}: {game_week}, {home_team} - {away_team}: no match report URL")
            else:
                time.sleep(random.uniform(6, 10))
                try:
                    data_players, data_goals = data_game(url_game, game_week, home_team, away_team, league, path_chrome)
                    data_teams = stats_team(url_game, home_team, away_team, game_week, league, path_chrome)
                    # Files are written only after both scrapes succeeded, so a
                    # match never ends up with a partial set of files.
                    data_players.to_csv(matches_dir / f"data_players_{no_games}.csv", index=False)
                    data_goals.to_csv(matches_dir / f"data_goals_{no_games}.csv", index=False)
                    data_teams.to_csv(matches_dir / f"data_teams_{no_games}.csv", index=True)
                except Exception as e:
                    # Best effort: log the failure and carry on with the next match.
                    print(f"⚠️ Failed for {no_games}: {game_week}, {home_team} - {away_team}: {e}")

        no_games += 1

For Italian Serie A:
→ 380 matches found.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
26

In [17]:
data_players_list = []
data_goals_list = []
data_teams_list = []

# Re-scan the per-match raw files on disk, one list per file kind.
player_files = sorted(
    name for name in os.listdir(path_folder_bis / "raw data/Matches/")
    if name.startswith("data_players_") and name.endswith(".csv")
)
goal_files = sorted(
    name for name in os.listdir(path_folder_bis / "raw data/Matches/")
    if name.startswith("data_goals_") and name.endswith(".csv")
)
team_files = sorted(
    name for name in os.listdir(path_folder_bis / "raw data/Matches/")
    if name.startswith("data_teams_") and name.endswith(".csv")
)

# Match number encoded in each file name (data_<kind>_<n>.csv).
player_nums = sorted(int(name.replace("data_players_", "").replace(".csv", "")) for name in player_files)
goal_nums = sorted(int(name.replace("data_goals_", "").replace(".csv", "")) for name in goal_files)
team_nums = sorted(int(name.replace("data_teams_", "").replace(".csv", "")) for name in team_files)

# Every match number that should exist: 1 .. no_matches inclusive.
full_range = set(range(1, no_matches+1))

missing_players = sorted(full_range.difference(player_nums))
missing_goals = sorted(full_range.difference(goal_nums))
missing_teams = sorted(full_range.difference(team_nums))

print("Missing players:", missing_players)
print("Missing goals  :", missing_goals)
print("Missing teams  :", missing_teams)

Missing players: [119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 31

In [18]:
# Merge every per-match raw file into one CSV per kind (players, goalkeepers, teams).
matches_dir = path_folder_bis / "raw data/Matches/"


def _match_file_order(file_name):
    """Sort key placing per-match files in numeric match order.

    A plain string sort would put ``data_players_10.csv`` before
    ``data_players_2.csv``. Names without a numeric suffix are kept and sorted
    alphabetically after the numbered ones.
    """
    suffix = file_name[:-len(".csv")].rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), file_name)
    return (1, 0, file_name)


def _list_match_files(prefix):
    """Return the ``<prefix>*.csv`` file names of the matches folder, in match order."""
    return sorted(
        (f for f in os.listdir(matches_dir) if f.startswith(prefix) and f.endswith(".csv")),
        key=_match_file_order,
    )


player_files = _list_match_files("data_players_")
goal_files = _list_match_files("data_goals_")
team_files = _list_match_files("data_teams_")

# pd.concat fails with an unhelpful message on an empty list; say what is missing.
for kind, files in (("players", player_files), ("goals", goal_files), ("teams", team_files)):
    if not files:
        raise ValueError(f"No data_{kind}_*.csv files found in {matches_dir}")

data_players_list = [pd.read_csv(matches_dir / file_name) for file_name in player_files]
data_goals_list = [pd.read_csv(matches_dir / file_name) for file_name in goal_files]
data_teams_list = [pd.read_csv(matches_dir / file_name) for file_name in team_files]

df_players = pd.concat(data_players_list, ignore_index=True)
df_goals = pd.concat(data_goals_list, ignore_index=True)
df_teams = pd.concat(data_teams_list, ignore_index=True)

# Column labels are replaced by position with the readable names defined above.
df_players = players_column_names(df_players, new_column_names_players)
df_goals = goals_column_names(df_goals, new_column_names_goals)

df_players.to_csv(os.path.join(path_folder_bis / "raw data/", "data_players.csv"), index=False)
df_goals.to_csv(os.path.join(path_folder_bis / "raw data/", "data_goals.csv"), index=False)
df_teams.to_csv(os.path.join(path_folder / "players/" , "clean/data_teams.csv"), index=False)

In [19]:
def xg_cumsum(url, team1, team2):
    """Build a minute-by-minute cumulative xG timeline for both teams of a match.

    The table at position 17 of the page is read; it must provide "Minute",
    "Squad" and "xG" columns. Minutes are reduced to their first run of digits
    and capped at 90.

    Parameters
    ----------
    url : str
        Location handed to ``pandas.read_html``.
    team1, team2 : str
        Values of the "Squad" column to accumulate; they become the column
        names of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by minute 1..90, one column per team holding the xG
        accumulated up to and including that minute.
    """
    shots = pd.read_html(url)[17]

    # Flatten a two-row header down to its second row.
    if isinstance(shots.columns, pd.MultiIndex):
        shots.columns = shots.columns.get_level_values(1)

    # Remove a header line that was parsed as the first data row.
    header_in_first_row = shots.iloc[0].astype(str).str.contains("Minute").any()
    if header_in_first_row:
        shots = shots.drop(0).reset_index(drop=True)

    shots = shots.dropna(subset=["Minute"]).copy()
    minute_digits = shots["Minute"].astype(str).str.extract(r"(\d+)")[0]
    shots["Minute"] = minute_digits.astype(int).clip(upper=90)
    shots["xG"] = pd.to_numeric(shots["xG"], errors="coerce").fillna(0)

    timeline = pd.DataFrame(index=range(1, 91))
    for squad in (team1, team2):
        running_xg = (
            shots[shots["Squad"] == squad]
            .groupby("Minute")["xG"]
            .sum()
            .cumsum()
        )
        # Minutes without a shot carry the previous total forward; minutes
        # before the first shot are 0.
        per_minute = pd.Series(timeline.index).map(running_xg).ffill().fillna(0)
        timeline[squad] = per_minute.values

    return timeline

# Example inputs for a manual xg_cumsum run on a single match report.
url = "https://fbref.com/en/matches/79a943de/Nice-Lille-October-29-2025-Ligue-1"
team1 = 'Nice'
team2 = 'Lille'

# Manual run, left disabled. The output path below is an absolute,
# machine-specific location.
#timeline = xg_cumsum(url, team1, team2)
#timeline.to_csv("/Users/matteolemesre/Desktop/xg.csv", index=False)