In [21]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
pd.set_option('display.max_columns', None)
import warnings

def scrape_league_data(league_name, url, table_id=None, verify=True, timeout=30):
    """Fetch ``url`` and return one HTML table from it as a DataFrame.

    Parameters
    ----------
    league_name : str
        Label written into the ``League`` column of the returned frame and
        used in diagnostic messages.
    url : str
        Page to download.
    table_id : str, optional
        ``id`` attribute of the ``<table>`` to extract. When omitted (or
        falsy) the first ``<table>`` in the document is used.
    verify : bool, default True
        Passed to ``requests.get``. TLS certificates are verified by default;
        pass ``False`` only if verification must be skipped.
    timeout : float, default 30
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the caller indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table with an extra ``League`` column, or ``None`` when
        the request fails, the server answers with an error status, or no
        matching table is present.
    """
    # Imported locally so the function does not depend on an edit elsewhere in
    # the notebook; pandas expects a file-like object, not a raw HTML string.
    from io import StringIO

    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, verify=verify, timeout=timeout)
    except requests.RequestException as exc:
        print(f" Request failed for {league_name}: {exc}")
        return None
    print(response.status_code, url)

    # Do not try to parse error pages (rate limiting, 404, ...) as data.
    if not response.ok:
        print(f" HTTP {response.status_code} for {league_name}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find("table", id=table_id) if table_id else soup.find("table")
    if table is None:
        print(f" Table not found for {league_name}")
        return None

    df = pd.read_html(StringIO(str(table)))[0]
    df['League'] = league_name

    return df



# Seasons to scrape, newest first, written as "<start>-<end>".
years = [f"{start}-{start + 1}" for start in range(2024, 2014, -1)]

# (display name, number used in the /comps/<n>/ segment of the URL).
_COMPETITIONS = [
    ("Bundesliga", 20),
    ("Premier League", 9),
    ("Serie A", 11),
    ("La Liga", 12),
    ("Ligue 1", 13),
    ("Eredivisie", 23),
    ("Primeira Liga", 32),
    ("EFL Championship", 10),
    ("Scottish Premiership", 40),
]

# One entry per competition. "{year}" is left as a literal placeholder in the
# URL so it can be filled in later with str.format(year=...).
leagueinfo = [
    {
        "name": comp_name,
        "url": (
            f"https://fbref.com/en/comps/{comp_id}/{{year}}/schedule/"
            f"{{year}}-{comp_name.replace(' ', '-')}-Scores-and-Fixtures"
        ),
    }
    for comp_name, comp_id in _COMPETITIONS
]


# Expand every (season, competition) pair into a concrete scrape target.
# Seasons vary slowest, so all competitions of one season are listed together.
leagues = [
    {
        "name": f"{league['name']} {year}",
        "url": league["url"].format(year=year),
    }
    for year in years
    for league in leagueinfo
]

# Scrape every target in order. Successful frames are kept both by name
# (league_dfs) and in scrape order (combined_list).
league_dfs = {}
combined_list = []
for league in leagues:
    df = scrape_league_data(league["name"], league["url"], table_id=None)
    if df is None:
        print(f"Failed for {league['name']} at {league['url']}")
    else:
        league_dfs[league["name"]] = df
        combined_list.append(df)
    # Pause between requests regardless of the outcome.
    time.sleep(5)

200 https://fbref.com/en/comps/20/2024-2025/schedule/2024-2025-Bundesliga-Scores-and-Fixtures
200 https://fbref.com/en/comps/9/2024-2025/schedule/2024-2025-Premier-League-Scores-and-Fixtures
200 https://fbref.com/en/comps/11/2024-2025/schedule/2024-2025-Serie-A-Scores-and-Fixtures
200 https://fbref.com/en/comps/12/2024-2025/schedule/2024-2025-La-Liga-Scores-and-Fixtures
200 https://fbref.com/en/comps/13/2024-2025/schedule/2024-2025-Ligue-1-Scores-and-Fixtures
200 https://fbref.com/en/comps/23/2024-2025/schedule/2024-2025-Eredivisie-Scores-and-Fixtures
200 https://fbref.com/en/comps/32/2024-2025/schedule/2024-2025-Primeira-Liga-Scores-and-Fixtures
200 https://fbref.com/en/comps/10/2024-2025/schedule/2024-2025-EFL-Championship-Scores-and-Fixtures
200 https://fbref.com/en/comps/40/2024-2025/schedule/2024-2025-Scottish-Premiership-Scores-and-Fixtures
200 https://fbref.com/en/comps/20/2023-2024/schedule/2023-2024-Bundesliga-Scores-and-Fixtures
200 https://fbref.com/en/comps/9/2023-2024/sche

KeyboardInterrupt: 

In [22]:
# Stack every scraped frame into one with a fresh 0..n-1 index, then discard
# the 'Notes' and 'Match Report' columns.
df2 = (
    pd.concat(combined_list, ignore_index=True)
    .drop(columns=['Notes', 'Match Report'])
)
df2

Unnamed: 0,Round,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,League
0,Bundesliga,1.0,Fri,2024-08-23,20:30,Gladbach,1.6,2–3,2.7,Leverkusen,54042.0,Stadion im Borussia-Park,Robert Schröder,Bundesliga 2024-2025
1,Bundesliga,1.0,Sat,2024-08-24,15:30,Hoffenheim,3.5,3–2,1.7,Holstein Kiel,18503.0,PreZero Arena,Tobias Stieler,Bundesliga 2024-2025
2,Bundesliga,1.0,Sat,2024-08-24,15:30,Mainz 05,1.2,1–1,0.6,Union Berlin,31500.0,Mewa Arena,Harm Osmers,Bundesliga 2024-2025
3,Bundesliga,1.0,Sat,2024-08-24,15:30,Augsburg,1.0,2–2,1.4,Werder Bremen,30660.0,WWK Arena,Sascha Stegemann,Bundesliga 2024-2025
4,Bundesliga,1.0,Sat,2024-08-24,15:30,Freiburg,2.1,3–1,0.4,Stuttgart,34700.0,Europa-Park Stadion,Tobias Welz,Bundesliga 2024-2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146,,38.0,Sat,2024-05-25,21:00,Real Madrid,1.0,0–0,0.7,Betis,73614.0,Estadio Santiago Bernabéu,Isidro Díaz de Mera,La Liga 2023-2024
5147,,38.0,Sun,2024-05-26,14:00,Getafe,0.9,1–2,1.4,Mallorca,9545.0,Coliseum Alfonso Pérez,Víctor García,La Liga 2023-2024
5148,,38.0,Sun,2024-05-26,15:15,Las Palmas,1.0,1–1,2.5,Alavés,23043.0,Estadio de Gran Canaria,Francisco Hernández,La Liga 2023-2024
5149,,38.0,Sun,2024-05-26,16:15,Celta Vigo,1.5,2–2,2.0,Valencia,21878.0,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,La Liga 2023-2024


In [27]:
# Columns grouped by the dtype they are converted to.
int64_col = ['Wk', 'Attendance']
float64_col = ['xG', 'xG.1']
string_col = [
    'Round', 'Day', 'Date', 'Time', 'Home', 'Score',
    'Away', 'Venue', 'Referee', 'League',
]

# Unparseable values become missing (errors='coerce'); the nullable 'Int64'
# dtype lets integer columns hold those missing values.
for col in int64_col:
    df2[col] = pd.to_numeric(df2[col], errors='coerce').astype('Int64')
for col in float64_col:
    df2[col] = pd.to_numeric(df2[col], errors='coerce').astype('float64')

# Each column is listed exactly once (the previous dict literal repeated 'Day').
df2 = df2.astype(dict.fromkeys(string_col, 'string'))
df2.dtypes

Round         string[python]
Wk                     Int64
Day           string[python]
Date          string[python]
Time          string[python]
Home          string[python]
xG                   float64
Score         string[python]
xG.1                 float64
Away          string[python]
Attendance             Int64
Venue         string[python]
Referee       string[python]
League        string[python]
dtype: object

In [32]:
# Show the frame without the 'Round' column. The result is only displayed,
# not assigned, so df2 itself still contains 'Round'.
df2.drop(columns=['Round'])

Unnamed: 0,Wk,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,League
0,1,Fri,2024-08-23,20:30,Gladbach,1.6,2–3,2.7,Leverkusen,54042,Stadion im Borussia-Park,Robert Schröder,Bundesliga 2024-2025
1,1,Sat,2024-08-24,15:30,Hoffenheim,3.5,3–2,1.7,Holstein Kiel,18503,PreZero Arena,Tobias Stieler,Bundesliga 2024-2025
2,1,Sat,2024-08-24,15:30,Mainz 05,1.2,1–1,0.6,Union Berlin,31500,Mewa Arena,Harm Osmers,Bundesliga 2024-2025
3,1,Sat,2024-08-24,15:30,Augsburg,1.0,2–2,1.4,Werder Bremen,30660,WWK Arena,Sascha Stegemann,Bundesliga 2024-2025
4,1,Sat,2024-08-24,15:30,Freiburg,2.1,3–1,0.4,Stuttgart,34700,Europa-Park Stadion,Tobias Welz,Bundesliga 2024-2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5146,38,Sat,2024-05-25,21:00,Real Madrid,1.0,0–0,0.7,Betis,73614,Estadio Santiago Bernabéu,Isidro Díaz de Mera,La Liga 2023-2024
5147,38,Sun,2024-05-26,14:00,Getafe,0.9,1–2,1.4,Mallorca,9545,Coliseum Alfonso Pérez,Víctor García,La Liga 2023-2024
5148,38,Sun,2024-05-26,15:15,Las Palmas,1.0,1–1,2.5,Alavés,23043,Estadio de Gran Canaria,Francisco Hernández,La Liga 2023-2024
5149,38,Sun,2024-05-26,16:15,Celta Vigo,1.5,2–2,2.0,Valencia,21878,Estadio Abanca Balaídos,Miguel Ángel Ortiz Arias,La Liga 2023-2024
