In [21]:
import os
import random
import time
from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [22]:
# Season identifier embedded in the csv folder name ("csv25_26").
season_code = "25_26"

# All locations are resolved relative to the directory the kernel runs in.
current_dir = Path.cwd()

# NOTE(review): the raw folder climbs three parent levels while the cleaned
# folder climbs only two — confirm this difference is intentional.
path_folder = current_dir.parent.parent.parent.joinpath(
    "csv", f"csv{season_code}", "raw data teams"
)
path_folder_end = current_dir.parent.parent.joinpath(
    "csv", f"csv{season_code}", "teams"
)

In [23]:
# The chromedriver binary is looked up four directory levels above the working directory.
current_dir = Path.cwd()
path_chrome = current_dir.parent.parent.parent.parent.joinpath("chromedriver2", "chromedriver")

# Scraping

In [24]:
# A page URL is assembled as:
#   <url_template><id>/<stat page>/<slug><url_end>
url_template = "https://fbref.com/en/comps/"
url_end = "-Stats"

# Stat pages fetched for every league, in this order.
url_stats = [
    "stats",
    "keepers",
    "keepersadv",
    "shooting",
    "passing",
    "passing_types",
    "gca",
    "defense",
    "possession",
    "misc",
]

# One entry per competition: [display name, id used in the URL, slug used in the URL].
# The display name is also used as the per-league CSV file name.
league_infos = [
    ["Italian Serie A", "11", "Serie-A"],
    ["French Ligue 1", "13", "Ligue-1"],
    ["German Bundesliga", "20", "Bundesliga"],
    ["English Premier League", "9", "Premier-League"],
    ["Spanish La Liga", "12", "La-Liga"],
    ["UEFA Champions League", "8", "Champions-League"],
    ["UEFA Europa League", "19", "Europa-League"],
    ["UEFA Europa Conference League", "882", "Conference-League"],
]

# Sub-folder holding the per-league CSVs, and base name of the combined CSV.
leagues_folder = "Teams"

In [25]:
def data_stat(url_stats, league_name, path_chrome):
    """Load one stats page in headless Chrome and return its first table.

    Parameters
    ----------
    url_stats : str
        Full URL of the page to download.
    league_name : str
        Display name of the league. For the three UEFA competitions the
        squad column is additionally cleaned (see below).
    path_chrome : str or pathlib.Path
        Location of the chromedriver executable.

    Returns
    -------
    pandas.DataFrame
        The first table ``pandas.read_html`` parses out of the page's
        ``<table>`` elements.

    Raises
    ------
    ValueError
        Raised by ``pandas.read_html`` when the page contains no table.
    """
    uefa_leagues = ("UEFA Champions League", "UEFA Europa League", "UEFA Europa Conference League")
    squad_column = ('Unnamed: 0_level_0', 'Squad')

    options = Options()
    for flag in (
        "--headless=new",
        "--disable-blink-features=AutomationControlled",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
    ):
        options.add_argument(flag)
    options.add_argument(
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/17.0"
    )

    driver = webdriver.Chrome(service=Service(str(path_chrome)), options=options)
    try:
        driver.get(url_stats)
        # Short randomized pause before reading the rendered page.
        time.sleep(random.uniform(2, 3))
        # NOTE(review): this runs after the page has loaded, so it only affects
        # later reads of navigator.webdriver — confirm it is still needed.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        html = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # otherwise every failed page leaves a Chrome process running.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all("table")
    data = pd.read_html(StringIO(str(tables)))[0]

    if league_name in uefa_leagues:
        # Strip a leading 2-3 lowercase-letter token followed by whitespace from
        # the squad name (presumably a country code on these pages — confirm).
        data[squad_column] = (
            data[squad_column].astype(str).str.replace(r'^[a-z]{2,3}\s+', '', regex=True)
        )
    return data

In [26]:
def data_stats(league_infos, url_stats, url_template, url_end, path_chrome):
    """Fetch every stat page of one league and join the tables side by side.

    Parameters
    ----------
    league_infos : sequence
        ``[display name, id used in the URL, slug used in the URL]``.
    url_stats : iterable of str
        Names of the stat pages to fetch, in the order their columns should appear.
    url_template, url_end : str
        Prefix and suffix wrapped around ``<id>/<stat>/<slug>`` to build each URL.
    path_chrome : str or pathlib.Path
        Location of the chromedriver executable, forwarded to ``data_stat``.

    Returns
    -------
    pandas.DataFrame
        All fetched tables concatenated column-wise; an empty frame when
        ``url_stats`` is empty.
    """
    league_name, competition_id, slug = league_infos[0], league_infos[1], league_infos[2]

    stat_frames = []
    for stat in url_stats:
        url = f"{url_template}{competition_id}/{stat}/{slug}{url_end}"
        stat_frames.append(data_stat(url, league_name, path_chrome))

    if not stat_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(stat_frames, axis=1)

In [None]:
def data_stats_leagues(leagues_folder, leagues_infos, url_stats, url_template, url_end,
                       path_chrome, output_root=None, expected_columns=228):
    """Scrape every league and write one raw CSV per league.

    Parameters
    ----------
    leagues_folder : str
        Sub-folder (below the output root) receiving the per-league CSV files.
    leagues_infos : iterable
        One ``[display name, id, slug]`` entry per league.
    url_stats, url_template, url_end, path_chrome
        Forwarded unchanged to ``data_stats``.
    output_root : str or pathlib.Path, optional
        Root folder for the export. Defaults to the notebook-level
        ``path_folder`` so existing calls keep working.
    expected_columns : int, optional
        Column count a scraped league table must have to be reported as
        validated. Defaults to 228.

    Returns
    -------
    None
        The function only writes files and prints a status line per league.
        The CSV is written whether or not the column check passes.
    """
    base_dir = path_folder if output_root is None else Path(output_root)
    target_dir = base_dir / leagues_folder
    # Make sure the destination exists so to_csv does not fail on a fresh checkout.
    target_dir.mkdir(parents=True, exist_ok=True)

    for infos in leagues_infos:
        league_name = infos[0]
        print(f"- {league_name}")

        df_league = data_stats(infos, url_stats, url_template, url_end, path_chrome)
        df_league.to_csv(target_dir / f"{league_name}.csv", index=False)

        n_columns = df_league.shape[1]
        if n_columns == expected_columns:
            print(" --> Validated")
        else:
            print(f" --> Refused — got {n_columns} columns")

In [28]:
# Scrape all configured leagues and write one raw CSV per league.
data_stats_leagues(
    leagues_folder=leagues_folder,
    leagues_infos=league_infos,
    url_stats=url_stats,
    url_template=url_template,
    url_end=url_end,
    path_chrome=path_chrome,
)

- Italian Serie A


TypeError: bad operand type for unary +: 'str'

In [None]:
# Combine every per-league raw CSV into a single raw table.
path = path_folder / leagues_folder

# Directory listing order is arbitrary, so sort the files to make the row
# order of the combined table reproducible between runs.
league_files = sorted(
    entry for entry in path.iterdir() if entry.name.endswith(".csv")
)

# Each file carries a two-row header (stat group / stat name).
league_frames = [pd.read_csv(league_file, header=[0, 1]) for league_file in league_files]

# Concatenate once instead of re-copying a growing frame per file; keep the
# empty-frame result when no league file is present.
df_leagues = pd.concat(league_frames, axis=0) if league_frames else pd.DataFrame()

df_leagues.to_csv(path_folder / f"{leagues_folder}.csv", index=False)

# Cleaning

In [None]:
# Flattened source column name -> readable output name. The insertion order
# defines the column order of the cleaned table.
_STAT_COLUMN_LABELS = {
    'Squad': 'Team',
    "Age": "Average Age",
    "Poss": "Possession",
    'Playing Time - MP': 'Matches Played',

    "Performance - Gls": "Goals",
    "Expected - xG": "xG",
    "Standard - Sh": "Shots",
    "Standard - SoT": "Shots on Target",
    "Standard - PK": "Penalties Scored",
    "Standard - PKatt": "Penalties Attempted",
    "SCA - SCA": "Shot Creating Actions",
    "GCA - GCA": "Goal Creating Actions",

    "Total - Cmp": "Passes Completed (Total)",
    "Total - Att": "Passes Attempted (Total)",
    "Short - Cmp": "Passes Completed (Short)",
    "Short - Att": "Passes Attempted (Short)",
    "Medium - Cmp": "Passes Completed (Medium)",
    "Medium - Att": "Passes Attempted (Medium)",
    "Long - Cmp": "Passes Completed (Long)",
    "Long - Att": "Passes Attempted (Long)",

    "KP": "Key Passes",
    "1/3": "Passes into Final Third",
    "PPA": "Passes into Penalty Area",
    "CrsPA": "Crosses into Penalty Area",
    "PrgP": "Progressive Passes",

    # These two keys used to be listed twice. The later entries won, so the
    # effective labels are kept here at the position of their first mention
    # to preserve the output column order.
    "Carries - PrgC": "Progressive Carries",
    "Receiving - PrgR": "Progressive Passes Received",

    "Tackles - Tkl": "Tackles",
    "Tackles - TklW": "Tackles Won",
    "Tackles - Def 3rd": "Tackles Defensive Third",
    "Tackles - Mid 3rd": "Tackles Middle Third",
    "Tackles - Att 3rd": "Tackles Attacking Third",

    "Challenges - Tkl": "Challenges Tackled",
    "Challenges - Att": "Challenges Attempted",
    "Challenges - Lost": "Challenges Lost",

    "Blocks - Blocks": "Blocks",

    "Int": "Interceptions",
    "Clr": "Clearances",
    "Err": "Errors",

    "Touches - Touches": "Touches",
    "Touches - Def Pen": "Touches Defensive Penalty Area",
    "Touches - Def 3rd": "Touches Defensive Third",
    "Touches - Mid 3rd": "Touches Middle Third",
    "Touches - Att 3rd": "Touches Attacking Third",
    "Touches - Att Pen": "Touches Attacking Penalty Area",
    "Touches - Live": "Live-Ball Touches",

    "Take-Ons - Att": "Take-Ons Attempted",
    "Take-Ons - Succ": "Successful Take-Ons",

    "Carries - Carries": "Carries",
    "Carries - 1/3": "Carries into Final Third",
    "Carries - CPA": "Carries into Penalty Area",
    "Carries - Mis": "Miscontrols",
    "Carries - Dis": "Dispossessed",

    "Receiving - Rec": "Passes Received",

    "Performance - CrdY": "Yellow Cards",
    "Performance - CrdR": "Red Cards",
    "Performance - 2CrdY": "Second Yellow Cards",
    "Performance - Fls": "Fouls Committed",
    "Performance - Fld": "Fouls Drawn",
    "Performance - Off": "Offsides",
    "Performance - PKwon": "Penalties Won",
    "Performance - PKcon": "Penalties Conceded",
    "Performance - OG": "Own Goals",
    "Performance - Recov": "Ball Recoveries",

    "Aerial Duels - Won": "Aerial Duels Won",
    "Aerial Duels - Lost": "Aerial Duels Lost",

    'Performance - GA': 'Goals Against',
    'Performance - SoTA': 'Shots on Target Against',
    'Performance - Saves': 'Saves',
    'Performance - CS': 'Clean Sheets',

    'Penalty Kicks - PKatt': 'Penalty Kicks Attempted',
    'Penalty Kicks - PKA': 'Penalty Kicks Allowed',
    'Penalty Kicks - PKsv': 'Penalty Kicks Saved',
    'Penalty Kicks - PKm': 'Penalty Kicks Missed',

    'Expected - PSxG': 'Post-Shot Expected Goals',

    'Launched - Cmp': 'Launched Passes Completed',
    'Launched - Att': 'Launched Passes Attempted',

    'Passes - Att (GK)': 'Passes Attempted (GK)',
    'Passes - Thr': 'Through Balls',

    'Goal Kicks - Att': 'Goal Kicks Attempted',

    'Crosses - Opp': 'Crosses Opposed',
    'Crosses - Stp': 'Crosses Stopped',

    'Sweeper - #OPA': 'Sweeper Actions',
}


def extract_and_rename_stats(df):
    """Flatten a two-level stats header and keep the known columns under readable names.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are ``(top, bottom)`` string pairs. The frame
        itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the following properties:

        * header pairs are flattened to ``"top - bottom"``, or to just
          ``bottom`` when the top level contains ``"Unnamed"``;
        * only columns listed in the rename map are kept, in map order,
          under their readable names;
        * when several columns share a name, the first one is kept;
        * infinities and missing values are replaced by 0;
        * ``float64`` / ``int64`` columns are rounded to two decimals.
    """
    flat_names = [
        (bottom.strip() if "Unnamed" in top else f"{top.strip()} - {bottom.strip()}").strip()
        for top, bottom in df.columns
    ]
    # set_axis returns a new frame, so the caller's header is not overwritten.
    flat = df.set_axis(flat_names, axis=1)

    wanted = [name for name in _STAT_COLUMN_LABELS if name in flat.columns]
    renamed = flat[wanted].rename(columns=_STAT_COLUMN_LABELS)

    # Drop repeated names before any per-column work, so the rounding below
    # never assigns into a duplicated label.
    unique = renamed.loc[:, ~renamed.columns.duplicated()].copy()

    cleaned = unique.replace([np.inf, -np.inf], 0).fillna(0)

    for col in cleaned.select_dtypes(include=['float64', 'int64']).columns:
        cleaned[col] = cleaned[col].round(2)

    return cleaned

In [None]:
# Read the combined raw table (two-row header), clean it, and store the result
# under the same file name in the cleaned-data folder.
path = path_folder.joinpath(f"{leagues_folder}.csv")
path_end = path_folder_end.joinpath(f"{leagues_folder}.csv")

df = extract_and_rename_stats(
    pd.read_csv(path, header=[0, 1], low_memory=False)
)
df.to_csv(path_end, index=False)