In [24]:
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_league_data(league_name, url, table_id=None, season="2024/25"):
    """Download one page and return its (first or named) HTML table as a DataFrame.

    Parameters
    ----------
    league_name : str
        Label printed in progress messages and written to the ``League`` column.
    url : str
        Page to fetch.
    table_id : str, optional
        ``id`` attribute of the ``<table>`` to extract. When omitted, the first
        ``<table>`` on the page is used.
    season : str, default "2024/25"
        Value written to the ``Season`` column.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with ``League`` and ``Season`` columns added, or
        ``None`` when the server answers with an error status or the page
        contains no matching table.

    Notes
    -----
    TLS certificate verification is left enabled and warnings are no longer
    silenced globally; connection-level errors still propagate to the caller.
    """
    print(f"Scraping {league_name}...")

    headers = {'User-Agent': 'Mozilla/5.0'}
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    if not response.ok:
        # Report the real cause (e.g. 404 / 429) instead of "Table not found".
        print(f"Request failed for {league_name}: HTTP {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find("table", id=table_id) if table_id else soup.find("table")
    if table is None:
        print(f"Table not found for {league_name}")
        return None

    # read_html expects a path/URL/file-like object; passing literal HTML is
    # deprecated, so wrap the markup in a StringIO buffer.
    df = pd.read_html(StringIO(str(table)))[0]
    df['League'] = league_name
    df['Season'] = season

    return df


In [None]:
# Seasons to scrape, newest first, in the "YYYY-YYYY" form substituted into the URLs.
years = [
    "2024-2025", "2023-2024", "2022-2023", "2021-2022", "2020-2021",
    "2019-2020", "2018-2019", "2017-2018", "2016-2017", "2015-2016",
]

# Leagues to scrape: display name plus a URL template with a {year} placeholder.
# The number after /comps/ is the site's competition id and must be unique per
# league; previously Eredivisie reused Bundesliga's id (20).
# NOTE(review): ids 23 (Eredivisie) and 32 (Primeira Liga) are taken from
# fbref's competition index — confirm if the site changes its numbering.
leagueinfo = [
    {
        "name": "Bundesliga",
        "url": "https://fbref.com/en/comps/20/{year}/schedule/{year}-Bundesliga-Scores-and-Fixtures",
    },
    {
        "name": "Premier League",
        "url": "https://fbref.com/en/comps/9/{year}/schedule/{year}-Premier-League-Scores-and-Fixtures",
    },
    {
        "name": "Serie A",
        "url": "https://fbref.com/en/comps/11/{year}/schedule/{year}-Serie-A-Scores-and-Fixtures",
    },
    {
        "name": "La Liga",
        "url": "https://fbref.com/en/comps/12/{year}/schedule/{year}-La-Liga-Scores-and-Fixtures",
    },
    {
        "name": "Ligue 1",
        "url": "https://fbref.com/en/comps/13/{year}/schedule/{year}-Ligue-1-Scores-and-Fixtures",
    },
    {
        "name": "Eredivisie",
        "url": "https://fbref.com/en/comps/23/{year}/schedule/{year}-Eredivisie-Scores-and-Fixtures",
    },
    {
        "name": "Primeira Liga",
        "url": "https://fbref.com/en/comps/32/{year}/schedule/{year}-Primeira-Liga-Scores-and-Fixtures",
    },
    {
        "name": "EFL Championship",
        "url": "https://fbref.com/en/comps/10/{year}/schedule/{year}-EFL-Championship-Scores-and-Fixtures",
    },
]

# Expand every (season, league) pair into a concrete scrape target:
# a season-qualified name, the URL with the season filled in, and the
# season rewritten with "/" in place of "-".
leagues = [
    {
        "name": f"{entry['name']} {yr}",
        "url": entry["url"].format(year=yr),
        "season": yr.replace("-", "/"),
    }
    for yr in years
    for entry in leagueinfo
]

# Scrape every target; keep the frames both keyed by name and as a flat list.
league_dfs = {}
combined_list = []

for league in leagues:
    # Forward the target's own season; without it every frame would be stamped
    # with scrape_league_data's default season regardless of the year scraped.
    df = scrape_league_data(league["name"], league["url"], season=league["season"])
    if df is not None:  # None means the page or table could not be retrieved
        league_dfs[league["name"]] = df
        combined_list.append(df)


In [32]:
# Show the list of per-league DataFrames accumulated by the scraping loop.
combined_list

[       Wk  Day        Date   Time           Home   xG Score  xG.1  \
 0     1.0  Fri  2023-08-11  19:30        Almería  1.4   0–2   2.1   
 1     1.0  Fri  2023-08-11  22:00        Sevilla  0.7   1–2   1.1   
 2     1.0  Sat  2023-08-12  17:00  Real Sociedad  1.0   1–1   0.8   
 3     1.0  Sat  2023-08-12  18:30     Las Palmas  0.9   1–1   1.6   
 4     1.0  Sat  2023-08-12  21:30  Athletic Club  0.4   0–2   0.9   
 ..    ...  ...         ...    ...            ...  ...   ...   ...   
 416  38.0  Sat  2024-05-25  21:00    Real Madrid  1.0   0–0   0.7   
 417  38.0  Sun  2024-05-26  14:00         Getafe  0.9   1–2   1.4   
 418  38.0  Sun  2024-05-26  15:15     Las Palmas  1.0   1–1   2.5   
 419  38.0  Sun  2024-05-26  16:15     Celta Vigo  1.5   2–2   2.0   
 420  38.0  Sun  2024-05-26  21:00        Sevilla  1.5   1–2   1.3   
 
                Away  Attendance                          Venue  \
 0    Rayo Vallecano     14837.0            Power Horse Stadium   
 1          Valencia    