In [1]:
import os
import time
from io import StringIO

import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# Address of the FBref 2024-2025 Champions League stats page
uefa = 'https://fbref.com/en/comps/8/2024-2025/2024-2025-Champions-League-Stats'
# Start the Chrome browser session that the following cells drive
driver = webdriver.Chrome()

In [3]:
# Point the browser session at the competition page
driver.get(url=uefa)

In [4]:
# Collect every element on the page that carries the "stats_table" class
tables = driver.find_elements(by=By.CLASS_NAME, value="stats_table")

In [5]:
# Use the first matched table. find_elements returns an empty list when
# nothing matches, so fail with an actionable message instead of a bare
# "list index out of range".
if not tables:
    raise IndexError(
        "No element with class 'stats_table' was found; "
        "check that the page loaded and the request was not blocked."
    )
table = tables[0]

In [6]:
# Serialize the selected table element, including its own opening/closing tag
table_html = table.get_attribute('outerHTML')

# Convert to a pandas DataFrame. read_html returns a list of frames; the
# markup passed in is the outerHTML of one element, so the first entry is
# taken. The string is wrapped in StringIO so it is handed over as a
# file-like object.
df1 = pd.read_html(StringIO(table_html))[0]

In [7]:
# Drop the Notes column. errors="ignore" keeps this cell safe to re-run once
# the column has already been removed (a plain drop would raise KeyError).
df1 = df1.drop(columns=["Notes"], errors="ignore")

# Discard every row in which at least one cell contains the text "Squad".
# NOTE(review): presumably these are header rows repeated inside the table
# body - confirm against the scraped page.
has_squad_text = df1.astype(str).apply(
    lambda col: col.str.contains("Squad", regex=False)
).any(axis=1)
df1 = df1[~has_squad_text]

In [9]:
# Load variables from a .env file into the process environment
load_dotenv()

# Get the folder path from .env. os.getenv returns None when the variable is
# missing, which would otherwise be formatted into the path as "None".
data_path = os.getenv("DATA_PATH")
if not data_path:
    raise RuntimeError("DATA_PATH is not set; define it in .env or the environment.")
print("Saving to:", data_path)

# Make sure the target folder exists, then save to CSV
os.makedirs(data_path, exist_ok=True)
df1.to_csv(f"{data_path}/standing.csv")

Saving to: D:/CodeShit/projects/Football-Fantasy-App/Data-Scraping/data2


In [10]:
# Scheme and host of the FBref site
base_url = "https://fbref.com"

In [11]:
from bs4 import BeautifulSoup

In [12]:
# Parse the captured table markup with the lxml parser so its tags can be searched
soup = BeautifulSoup(table_html, features="lxml")

In [13]:
# Build an absolute URL for every squad page linked from the table: keep only
# <a> tags whose href contains "/squads/" and prefix each with base_url
# (rather than repeating the host literal here).
teams_links1 = [
    base_url + anchor["href"]
    for anchor in soup.find_all("a", href=True)
    if "/squads/" in anchor["href"]
]

In [14]:
# get links for champions league data and team names
# originally took about 2 min; the pause added per team lengthens this
ucl_link_xpath = "//a[contains(@href, 'squads') and contains(@href, 'Champions-League')]"
team_links = []
team_name = []
for team_link in teams_links1:
    driver.get(team_link)

    # find_elements returns an empty list instead of raising, so a missing
    # link can be reported together with the page it was expected on.
    ucl_anchors = driver.find_elements(By.XPATH, ucl_link_xpath)
    if not ucl_anchors:
        raise RuntimeError(f"No Champions League squad link found on {team_link}")

    # Record link and name only after the lookup succeeded, so the two lists
    # stay index-aligned for the later zip().
    team_links.append(ucl_anchors[0].get_attribute('href'))
    # Team name = last URL segment with the "-Stats" suffix removed
    team_name.append(team_link.split("/")[-1].replace("-Stats", ""))

    # Pause between page loads, matching the 2 s delay used by the
    # squad-scraping loop further down.
    time.sleep(2)

In [None]:
# takes about 2min

# Path to folder containing all CSVs. os.getenv returns None when the variable
# is missing, which would otherwise be formatted into the path as "None".
squads_path = os.getenv("SQUADS_PATH")
if not squads_path:
    raise RuntimeError("SQUADS_PATH is not set; define it in .env or the environment.")
os.makedirs(squads_path, exist_ok=True)

for team, teamName in zip(team_links, team_name):
    # open the link
    driver.get(team)

    # take the first "stats_table" element on the page; fail with the
    # offending URL if the page has none
    player_tables = driver.find_elements(By.CLASS_NAME, "stats_table")
    if not player_tables:
        raise RuntimeError(f"No 'stats_table' element found on {team}")

    # get its html
    pt_html = player_tables[0].get_attribute('outerHTML')

    # header=1 uses the second header row as column names, discarding the
    # grouped header row above it
    df = pd.read_html(StringIO(pt_html), header=1)[0]

    # drop the last 11 columns (the original author's note: per-90-minute stats)
    df = df.iloc[:, :-11]
    # drop the last 2 rows
    # NOTE(review): presumably summary/total rows - confirm against the page
    df = df.iloc[:-2, :]

    # remove rows in which any cell contains "Performance" or "Gls"
    # (header rows repeated in between the data rows)
    is_header_row = df.astype(str).apply(
        lambda col: col.str.contains("Performance|Gls")
    ).any(axis=1)

    # Reset index after cleaning
    df = df[~is_header_row].reset_index(drop=True)

    # save it to the squads folder, one CSV per team
    df.to_csv(f"{squads_path}/{teamName}.csv", encoding="utf-8")

    # pause before requesting the next page
    time.sleep(2)

In [16]:
# Combine every per-team CSV in squads_path into one frame, tagging each row
# with the team it came from.
all_dfs = []

# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(squads_path)):
    if not filename.endswith(".csv"):
        continue

    # Get team name from filename. A dedicated variable is used so the
    # `team_name` list built by the link-collection cell is not overwritten,
    # which would break a re-run of the squad-scraping cell.
    squad_name = filename.replace(".csv", "")

    # Add a column for Team name
    all_dfs.append(
        pd.read_csv(os.path.join(squads_path, filename)).assign(Team=squad_name)
    )

if not all_dfs:
    raise ValueError(f"No CSV files found in {squads_path}")

final_df = pd.concat(all_dfs, ignore_index=True)

# Drop the column at position 1.
# NOTE(review): the per-team CSVs were written with their index, so position 0
# is presumably that saved index and position 1 the first scraped column -
# confirm this is the column intended to be removed.
final_df = final_df.drop(final_df.columns[1], axis=1)

# Save to one big CSV
final_df.to_csv(f"{data_path}/all_teams_ucl.csv", index=True, encoding="utf-8")
