In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment
import warnings
import time 

def scrape_player_stats(player_name, url, table_id="stats_standard", season=None):
    warnings.filterwarnings("ignore")
    headers = {'User-Agent': 'Mozilla/5.0'}

    response = requests.get(url, headers=headers,verify=False)
    soup = BeautifulSoup(response.content, 'html.parser')
    print(response.status_code, url)

    # Try to find table normally
    table = soup.find("table", id="stats_standard")
    print(table)

    # If not found, look in comments
    
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "stats_standard" in comment:
            print(f"Found{table_id} in comment for {player_name}")
            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find('table', id=table_id)
            if table:
                break

    if not table:
        print(f"Table not found for {player_name}")
        return None

    # Try reading the table with pandas
    try:
        df = pd.read_html(str(table), header=1)[0]
    except Exception as e:
        print(f"Error parsing table for {player_name}: {e}")
        return None

    df['League'] = player_name
    df['Season'] = season

    return df

'''
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if 'stats_standard' in comment:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            table = comment_soup.find('table', id='stats_standard')
'''

"\n    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):\n        if 'stats_standard' in comment:\n            comment_soup = BeautifulSoup(comment, 'html.parser')\n            table = comment_soup.find('table', id='stats_standard')\n"

In [None]:
#https://fbref.com/en/comps/20/stats/Bundesliga-Stats
years = ["2024-2025","2023-2024","2022-2023","2021-2022","2020-2021","2019-2020","2018-2019","2017-2018","2016-2017","2015-2016"]
leagueinfo = [
    {
        "name": "Bundesliga",
        "url": "https://fbref.com/en/comps/20/{year}/stats/{year}-Bundesliga-Stats"
    },
    {
        "name": "Premier League",
        "url": "https://fbref.com/en/comps/9/{year}/stats/{year}-Premier-League-Stats"
    },
    {
        "name": "La Liga",
        "url": "https://fbref.com/en/comps/12/{year}/stats/{year}-La-Liga-Stats"
    },
    {
        "name": "Serie A",
        "url": "https://fbref.com/en/comps/11/{year}/stats/{year}-Serie-A-Stats"
    },
    {
        "name": "Ligue 1",
        "url": "https://fbref.com/en/comps/13/{year}/stats/{year}-Ligue-1-Stats"
    },
    {
        "name": "Eredivisie",
        "url": "https://fbref.com/en/comps/23/{year}/stats/{year}-Eredivisie-Stats"
    },
    {
        "name": "Primeira Liga",
        "url": "https://fbref.com/en/comps/32/{year}/stats/{year}-Primeira-Liga-Stats"
    },
    {
        "name": "Championship",
        "url": "https://fbref.com/en/comps/10/{year}/stats/{year}-EFL-Championship-Stats"
    },
    {
        "name": "Scottish Premiership",
        "url": "https://fbref.com/en/comps/40/{year}/stats/{year}-Scottish-Premiership-Stats"
    }
 
]

leagues = []
for year in years:
    for league in leagueinfo:
        leagues.append({
            "name": f"{league['name']} {year}",
            "url": league["url"].format(year=year),
            "season": year.replace("-", "/"),
        })

league_dfs = {}
combined_list = []
for league in leagues:
    df = scrape_player_stats(league["name"], league["url"])
    if df is not None:
        league_dfs[league["name"]] = df
        combined_list.append(df)
    else:
        print(f"Failed for {league['name']} at {league['url']}")
    time.sleep(5)  

200 https://fbref.com/en/comps/20/2024-2025/stats/2024-2025-Bundesliga-Stats
None
Foundstats_standard in comment for Bundesliga 2024-2025
200 https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats
None
Foundstats_standard in comment for Premier League 2024-2025
200 https://fbref.com/en/comps/12/2024-2025/stats/2024-2025-La-Liga-Stats
None
Foundstats_standard in comment for La Liga 2024-2025
200 https://fbref.com/en/comps/11/2024-2025/stats/2024-2025-Serie-A-Stats
None
Foundstats_standard in comment for Serie A 2024-2025
200 https://fbref.com/en/comps/13/2024-2025/stats/2024-2025-Ligue-1-Stats
None
Foundstats_standard in comment for Ligue 1 2024-2025
200 https://fbref.com/en/comps/23/2024-2025/stats/2024-2025-Eredivisie-Stats
None
Foundstats_standard in comment for Eredivisie 2024-2025
200 https://fbref.com/en/comps/32/2024-2025/stats/2024-2025-Primeira-Liga-Stats
None
Foundstats_standard in comment for Primeira Liga 2024-2025
200 https://fbref.com/en/comps/10/2024-

In [None]:
df = pd.concat(combined_list,ignore_index=True)

df

In [None]:
league_dfs =  pd.concat(combined_list, ignore_index=True)
league_dfs['League'].unique().tolist()

['Serie A 2024-2025',
 'Ligue 1 2024-2025',
 'La Liga 2019-2020',
 'Serie A 2019-2020',
 'Primeira Liga 2019-2020',
 'Serie A 2016-2017',
 'Eredivisie 2015-2016']

In [None]:
df = pd.concat(combined_list, ignore_index=True)
df['League'].unique().tolist()


['Serie A 2024-2025',
 'Ligue 1 2024-2025',
 'La Liga 2019-2020',
 'Serie A 2019-2020',
 'Primeira Liga 2019-2020',
 'Serie A 2016-2017',
 'Eredivisie 2015-2016']