In [7]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_league_data(league_name, url, table_id=None, season="2024/25"):
    print(f"Scraping {league_name}...")

    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find("table", id=table_id) if table_id else soup.find("table")
    if not table:
        print(f"❌ Table not found for {league_name}")
        return None

    df = pd.read_html(str(table))[0]
    df['League'] = league_name
    df['Season'] = season

    return df




In [None]:
# List of leagues to scrape
years = ["2024-2025","2023-2024","2022-2023"]
leagueinfo = [
    {
        "name": "Bundesliga",
        "url": "https://fbref.com/en/comps/20/{year}/schedule/{year}-Bundesliga-Scores-and-Fixtures"
    },
    {
        "name": "Premier League",
        "url": "https://fbref.com/en/comps/9/{year}/schedule/{year}-Premier-League-Scores-and-Fixtures"
    },
    {
        "name": "Serie A",
        "url": "https://fbref.com/en/comps/11/{year}/schedule/{year}-Serie-A-Scores-and-Fixtures"
    }
]

leagues = []
for year in years:
    for league in leagueinfo:
        leagues.append({
            "name": f"{league['name']} {year}",
            "url": league["url"].format(year=year),
            "season": year.replace("-", "/")
        })

league_dfs = {}
combined_list = []

for league in leagues:
    df = scrape_league_data(league["name"], league["url"])
    if df is not None:
        league_dfs[league["name"]] = df  # Store separately
        combined_list.append(df)         # Also save for merging


In [20]:
league_dfs

{'Bundesliga 2024-2025':                                          Round    Wk  Day        Date   Time  \
 0                                   Bundesliga   1.0  Fri  2024-08-23  20:30   
 1                                   Bundesliga   1.0  Sat  2024-08-24  15:30   
 2                                   Bundesliga   1.0  Sat  2024-08-24  15:30   
 3                                   Bundesliga   1.0  Sat  2024-08-24  15:30   
 4                                   Bundesliga   1.0  Sat  2024-08-24  15:30   
 ..                                         ...   ...  ...         ...    ...   
 337                                 Bundesliga  34.0  Sat  2025-05-17  15:30   
 338                                 Bundesliga  34.0  Sat  2025-05-17  15:30   
 339                                        NaN   NaN  NaN         NaN    NaN   
 340  German 1/2 Relegation/Promotion play-offs   NaN  Thu  2025-05-22  20:30   
 341  German 1/2 Relegation/Promotion play-offs   NaN  Mon  2025-05-26  20:30   
 
  

In [18]:
Serie_A_22_23 = df["Serie A 2022-2023"]

KeyError: 'Serie A 2022-2023'