In [None]:
# You may need to install these:
%pip install selenium webdriver-manager
import time

%pip install pandas lxml
import pandas as pd
import re # For regex operations

from io import StringIO
%pip install html5lib

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

In [3]:
# Full script for scraping team shooting stats from FBRef.
# Collects the match results and per-match shooting stats of every team in the
# Premier League 2024-2025 season. Selenium loads each page, BeautifulSoup
# locates the links, and pandas parses the HTML tables. The Premier League rows
# of all teams are combined and written to a single CSV file.

# --- Configuration -----------------------------------------------------------
# Set HEADLESS to True to run Chrome without a window; headless mode was
# previously needed to work around a WSL connectivity error.
HEADLESS = False
# Seconds to pause after every page load before reading the page source.
# NOTE(review): raise this if pages come back incomplete or the site starts
# rejecting requests.
PAGE_LOAD_WAIT_SECONDS = 1
base_url = "https://fbref.com"
LEAGUE_URL = "https://fbref.com/en/comps/9/2024-2025/Premier-League-Stats"
LEAGUE_TABLE_ID = "results2024-202591_overall"
SHOOTING_CAPTION = re.compile(r"2024-2025.*: All Competitions")
MERGE_KEYS = ["Date", "Comp", "Round"]
SHOOTING_COLUMNS = MERGE_KEYS + ["Sh", "SoT", "Dist", "FK", "PK"]
OUTPUT_CSV = "premier_league_2024_2025_team_stats.csv"


def _fetch_html(browser, url):
    """Load `url` in `browser`, pause PAGE_LOAD_WAIT_SECONDS, return the page source."""
    browser.get(url)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    return browser.page_source


def _team_name_from_url(team_url):
    """Build a display name from the last URL segment.

    A segment such as 'Manchester-City-Stats' becomes 'Manchester City'
    (the '-Stats' suffix is dropped, dashes become spaces, no stray whitespace).
    """
    return team_url.split("/")[-1].replace("-Stats", "").replace("-", " ").strip()


# --- Browser setup -----------------------------------------------------------
chrome_options = Options()
if HEADLESS:
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)

# --- Scrape ------------------------------------------------------------------
team_frames = []  # one DataFrame per successfully scraped team
try:
    # Collect the squad links from the league table.
    soup = BeautifulSoup(_fetch_html(driver, LEAGUE_URL), "html.parser")
    teams_table = soup.find("table", {"id": LEAGUE_TABLE_ID})
    if teams_table:
        links = [
            anchor
            for anchor in teams_table.find_all("a", href=True)
            if "/squads/" in anchor["href"]
        ]
    else:
        # No league table means no teams to visit; use an empty list so the
        # code below does not fail on an undefined name.
        print("Table not found.")
        links = []

    # Create full URL links to the team pages.
    team_urls = [base_url + link["href"] for link in links]

    for team_url in team_urls:
        html = _fetch_html(driver, team_url)

        # Find the link to this team's shooting match log.
        shooting_link = BeautifulSoup(html, "html.parser").find("a", string="Shooting")
        if shooting_link is None or not shooting_link.has_attr("href"):
            # Skip the team: merging without fresh shooting data would reuse
            # the previous team's table (or fail on the first team).
            print("Shooting link not found.")
            print(f"Error processing matches for {team_url}. Skipping this team.")
            continue

        print(shooting_link["href"])
        shooting_html = _fetch_html(driver, base_url + shooting_link["href"])

        try:
            # read_html raises ValueError when no table matches, droplevel
            # raises ValueError on single-level columns, and the column
            # selection raises KeyError when an expected column is absent.
            match_df = pd.read_html(StringIO(html), match="Scores & Fixtures")[0]
            shoot = pd.read_html(StringIO(shooting_html), match=SHOOTING_CAPTION)[0]
            shoot.columns = shoot.columns.droplevel()  # drop the top header row
            match_df = match_df.merge(shoot[SHOOTING_COLUMNS], on=MERGE_KEYS, how="left")
        except (ValueError, KeyError, IndexError):
            print(f"Error processing matches for {team_url}. Skipping this team.")
            continue

        team_data = match_df[match_df["Comp"] == "Premier League"].copy()
        team_data["Season"] = "2024-2025"
        team_data["Team"] = _team_name_from_url(team_url)
        team_frames.append(team_data)
finally:
    # Always close the browser, even when scraping fails part-way through.
    driver.quit()

# --- Save --------------------------------------------------------------------
if team_frames:
    # Combine all team data into a single DataFrame with one concat call.
    all_matches = pd.concat(team_frames, ignore_index=True)
    all_matches.to_csv(OUTPUT_CSV, index=False)
    print("Data saved to premier_league_2024_2025_team_stats.csv")
else:
    # Keep `all_matches` defined, but do not overwrite an existing CSV with an
    # empty file.
    all_matches = pd.DataFrame()
    print("No team data was scraped; CSV file not written.")

/en/squads/822bd0ba/2024-2025/matchlogs/all_comps/shooting/Liverpool-Match-Logs-All-Competitions
/en/squads/18bb7c10/2024-2025/matchlogs/all_comps/shooting/Arsenal-Match-Logs-All-Competitions
/en/squads/b8fd03ef/2024-2025/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions
/en/squads/cff3d9bb/2024-2025/matchlogs/all_comps/shooting/Chelsea-Match-Logs-All-Competitions
/en/squads/b2b47a98/2024-2025/matchlogs/all_comps/shooting/Newcastle-United-Match-Logs-All-Competitions
/en/squads/8602292d/2024-2025/matchlogs/all_comps/shooting/Aston-Villa-Match-Logs-All-Competitions
/en/squads/e4a775cb/2024-2025/matchlogs/all_comps/shooting/Nottingham-Forest-Match-Logs-All-Competitions
/en/squads/d07537b9/2024-2025/matchlogs/all_comps/shooting/Brighton-and-Hove-Albion-Match-Logs-All-Competitions
/en/squads/4ba7cbea/2024-2025/matchlogs/all_comps/shooting/Bournemouth-Match-Logs-All-Competitions
/en/squads/cd051869/2024-2025/matchlogs/all_comps/shooting/Brentford-Match-Logs-All-Competi