In [54]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import warnings

def scrape_league_data(league_name, url, table_id=None, season="2024/25"):
    """Download a page and return one of its HTML tables as a DataFrame.

    NOTE: a later cell in this notebook defines a function with the same
    name; once that cell has run, it replaces this definition.

    Parameters
    ----------
    league_name : str
        Label printed in progress messages and stored in the ``League`` column.
    url : str
        Address of the page to fetch.
    table_id : str, optional
        ``id`` attribute of the ``<table>`` to extract. When omitted, the
        first ``<table>`` in the document is used.
    season : str, default "2024/25"
        Value stored in the ``Season`` column.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with ``League`` and ``Season`` columns added, or
        ``None`` when the request fails, the server answers with an error
        status, or no matching table exists.
    """
    # Local import so this cell does not depend on another cell's imports.
    from io import StringIO

    print(f"Scraping {league_name}...")

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Certificate verification is left at the requests default (enabled),
        # and a timeout stops a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for {league_name}: {exc}")
        return None

    # An error page (e.g. 403/429) has no data table; report the real cause
    # instead of a misleading "Table not found".
    if not response.ok:
        print(f"HTTP {response.status_code} for {league_name}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find("table", id=table_id) if table_id else soup.find("table")
    if table is None:
        print(f"Table not found for {league_name}")
        return None

    # read_html expects a file-like object; passing raw HTML text is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]
    df['League'] = league_name
    df['Season'] = season

    return df


In [55]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment
import warnings

def scrape_league_data(league_name, url, table_id=None, season="2024/25"):
    """Download a page and return one of its HTML tables as a DataFrame.

    The table is searched for in the parsed document first and, if absent,
    inside each HTML comment of the page.

    Parameters
    ----------
    league_name : str
        Label printed in progress messages and stored in the ``League`` column.
    url : str
        Address of the page to fetch.
    table_id : str, optional
        ``id`` attribute of the ``<table>`` to extract. When omitted, the
        first ``<table>`` encountered is used.
    season : str, default "2024/25"
        Value stored in the ``Season`` column.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with ``League`` and ``Season`` columns added, or
        ``None`` when the request fails, the server answers with an error
        status, or no matching table exists.
    """
    # Local import so this cell does not depend on another cell's imports.
    from io import StringIO

    def _find_table(document):
        # Honour the caller's table_id; fall back to the first table otherwise.
        if table_id:
            return document.find("table", id=table_id)
        return document.find("table")

    print(f"Scraping {league_name}...")

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Certificate verification is left at the requests default (enabled),
        # and a timeout stops a stalled connection from hanging the notebook.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for {league_name}: {exc}")
        return None

    # An error page (e.g. 403/429) has no data table; report the real cause
    # instead of a misleading "Table not found".
    if not response.ok:
        print(f"HTTP {response.status_code} for {league_name}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    table = _find_table(soup)
    if table is None:
        # If not found, look inside HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            table = _find_table(BeautifulSoup(comment, 'html.parser'))
            if table is not None:
                break

    if table is None:
        print(f"Table not found for {league_name}")
        return None

    # read_html expects a file-like object; passing raw HTML text is deprecated.
    df = pd.read_html(StringIO(str(table)))[0]
    df['League'] = league_name
    df['Season'] = season

    return df

In [56]:
# Seasons and leagues to scrape from FBref's "Scores and Fixtures" pages.
import time

years = ["2024-2025","2023-2024","2022-2023","2021-2022","2020-2021","2019-2020","2018-2019","2017-2018","2016-2017","2015-2016"]

# Each URL template is completed with a season string from `years`.
# The number after /comps/ is FBref's competition id.
leagueinfo = [
    {
        "name": "Bundesliga",
        "url": "https://fbref.com/en/comps/20/{year}/schedule/{year}-Bundesliga-Scores-and-Fixtures"
    },
    {
        "name": "Premier League",
        "url": "https://fbref.com/en/comps/9/{year}/schedule/{year}-Premier-League-Scores-and-Fixtures"
    },
    {
        "name": "Serie A",
        "url": "https://fbref.com/en/comps/11/{year}/schedule/{year}-Serie-A-Scores-and-Fixtures"
    },
    {
        "name": "La Liga",
        "url": "https://fbref.com/en/comps/12/{year}/schedule/{year}-La-Liga-Scores-and-Fixtures"
    },
    {
        "name": "Ligue 1",
        "url": "https://fbref.com/en/comps/13/{year}/schedule/{year}-Ligue-1-Scores-and-Fixtures"
    },
    {
        "name": "Eredivisie",
        # FBref competition id 23 (20 is the Bundesliga).
        "url": "https://fbref.com/en/comps/23/{year}/schedule/{year}-Eredivisie-Scores-and-Fixtures"
    },
    {
        "name": "Primeira Liga",
        # FBref competition id 32 (19 is the Europa League).
        "url": "https://fbref.com/en/comps/32/{year}/schedule/{year}-Primeira-Liga-Scores-and-Fixtures"
    },
    {
        "name": "EFL Championship",
        "url": "https://fbref.com/en/comps/10/{year}/schedule/{year}-EFL-Championship-Scores-and-Fixtures"
    }
]

# Seconds to wait between consecutive requests so the site is not hit in a
# burst. NOTE(review): tune against Sports Reference's current bot policy.
REQUEST_DELAY_SECONDS = 6

# One entry per (season, league) pair. The name includes the season so it
# can serve as a unique key in `league_dfs`.
leagues = []
for year in years:
    for league in leagueinfo:
        leagues.append({
            "name": f"{league['name']} {year}",
            "url": league["url"].format(year=year),
            "season": year.replace("-", "/")
        })

league_dfs = {}      # table per league-season, keyed by its name
combined_list = []   # the same tables in scrape order, for concatenation

for position, league in enumerate(leagues):
    if position:
        time.sleep(REQUEST_DELAY_SECONDS)
    # Pass the season explicitly; otherwise every row gets the default value.
    df = scrape_league_data(league["name"], league["url"], season=league["season"])
    if df is not None:
        league_dfs[league["name"]] = df
        combined_list.append(df)


Scraping Bundesliga 2024-2025...
Table not found for Bundesliga 2024-2025
Scraping Premier League 2024-2025...
Table not found for Premier League 2024-2025
Scraping Serie A 2024-2025...
Table not found for Serie A 2024-2025
Scraping La Liga 2024-2025...
Table not found for La Liga 2024-2025
Scraping Ligue 1 2024-2025...
Table not found for Ligue 1 2024-2025
Scraping Eredivisie 2024-2025...
Table not found for Eredivisie 2024-2025
Scraping Primeira Liga 2024-2025...
Table not found for Primeira Liga 2024-2025
Scraping EFL Championship 2024-2025...
Table not found for EFL Championship 2024-2025
Scraping Bundesliga 2023-2024...
Table not found for Bundesliga 2023-2024
Scraping Premier League 2023-2024...
Table not found for Premier League 2023-2024
Scraping Serie A 2023-2024...
Table not found for Serie A 2023-2024
Scraping La Liga 2023-2024...
Table not found for La Liga 2023-2024
Scraping Ligue 1 2023-2024...
Table not found for Ligue 1 2023-2024
Scraping Eredivisie 2023-2024...
Table n

In [57]:
# Show the list of per-league tables that the scraping loop collected.
combined_list

[                                         Round    Wk  Day        Date   Time  \
 0                                   Bundesliga   1.0  Fri  2022-08-05  20:30   
 1                                   Bundesliga   1.0  Sat  2022-08-06  15:30   
 2                                   Bundesliga   1.0  Sat  2022-08-06  15:30   
 3                                   Bundesliga   1.0  Sat  2022-08-06  15:30   
 4                                   Bundesliga   1.0  Sat  2022-08-06  15:30   
 ..                                         ...   ...  ...         ...    ...   
 337                                 Bundesliga  34.0  Sat  2023-05-27  15:30   
 338                                 Bundesliga  34.0  Sat  2023-05-27  15:30   
 339                                        NaN   NaN  NaN         NaN    NaN   
 340  German 1/2 Relegation/Promotion play-offs   NaN  Thu  2023-06-01  20:45   
 341  German 1/2 Relegation/Promotion play-offs   NaN  Mon  2023-06-05  20:45   
 
                Home   xG 

In [58]:
# Stack every scraped table into one frame (index renumbered from zero),
# then show which league labels are present in the combined result.
df = pd.concat(objs=combined_list, ignore_index=True)
pd.unique(df['League'])


array(['Bundesliga 2022-2023', 'Primeira Liga 2018-2019',
       'Primeira Liga 2016-2017'], dtype=object)