In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import time
from requests.exceptions import RequestException

def fetch_data_notebook(url, retries=3, delay=2):
    """
    Fetch ``url`` with browser-like headers, retrying on failure.

    An attempt counts as failed when the request raises a
    ``RequestException`` or the response status code is not 200.  The
    function waits ``delay`` seconds between attempts, but not after the
    final one, so a total failure returns without a pointless extra pause.

    :param url: The URL to fetch data from.
    :param retries: Maximum number of attempts to make.
    :param delay: Delay (in seconds) between consecutive attempts.
    :return: Response object if an attempt returned HTTP 200, otherwise None.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://google.com'
    }

    for attempt in range(retries):
        more_attempts_left = attempt < retries - 1
        # Only announce a retry when one will actually happen.
        suffix = " Retrying..." if more_attempts_left else ""
        try:
            print(f"Fetching data from: {url} (Attempt {attempt + 1}/{retries})")
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                print("Data fetched successfully!")
                return response
            print(f"Unexpected status code {response.status_code}.{suffix}")
        except RequestException as e:
            print(f"Error: {e}.{suffix}")

        # Sleeping after the last attempt would only delay the failure result.
        if more_attempts_left:
            time.sleep(delay)

    print("Failed to fetch data after multiple retries.")
    return None

In [2]:
# Starting page for the crawl: the Premier League stats page on fbref.com.
# The scraping loop further down overwrites this variable with the
# previous-season URL on each pass, and a later cell resets it to this value.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Fetch the standings page through the retrying helper (presumably a
# replacement for a plain requests.get call). The result is a Response, or
# None if every attempt failed. Note the scraping loop below rebinds `data`.
data = fetch_data_notebook(standings_url)

In [None]:
# Season labels to scrape, newest first; one standings page is visited per entry.
years = sorted(range(2024, 2026), reverse=True)
print(years)
# Accumulates one DataFrame per successfully scraped team.
all_matches = list()

In [6]:
# Reset the crawl state before (re)running the scraping loop: the loop mutates
# both of these, so they must start from known values.
counter = 1  # running number of the team page being processed (progress output)
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Seconds to pause after each team. Also passed as the retry delay of
# fetch_data_notebook so that a failed request is not retried immediately.
REQUEST_PAUSE_SECONDS = 20
FBREF_ROOT = "https://fbref.com"
# Columns taken from the shooting table and joined onto the fixtures by date.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _matching_hrefs(node, fragment):
    """Return the href of every <a> under ``node`` whose href contains ``fragment``.

    Anchors without an href attribute are skipped rather than raising TypeError.
    """
    hrefs = (anchor.get("href") for anchor in node.find_all("a"))
    return [href for href in hrefs if href and fragment in href]


def _scrape_team(team_url, season):
    """Download one team's fixtures and shooting stats and merge them.

    :param team_url: Absolute URL of the team's page.
    :param season: Value stored in the ``Season`` column of the result.
    :return: DataFrame of the team's "Premier League" rows with ``Season`` and
             ``Team`` columns added, or None when any step fails (the reason
             is printed so skipped teams are visible).
    """
    team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

    team_page = fetch_data_notebook(team_url, delay=REQUEST_PAUSE_SECONDS)
    if team_page is None:
        print(f"Skipping {team_name}: team page could not be fetched")
        return None
    try:
        matches = pd.read_html(StringIO(team_page.text), match="Scores & Fixtures")[0]
    except Exception as exc:
        print(f"Skipping {team_name}: no fixtures table ({exc})")
        return None

    team_soup = BeautifulSoup(team_page.text, "html.parser")
    shooting_links = _matching_hrefs(team_soup, "all_comps/shooting/")
    if not shooting_links:
        print(f"Skipping {team_name}: no shooting link on team page")
        return None

    shooting_page = fetch_data_notebook(
        f"{FBREF_ROOT}{shooting_links[0]}", delay=REQUEST_PAUSE_SECONDS
    )
    if shooting_page is None:
        print(f"Skipping {team_name}: shooting page could not be fetched")
        return None
    try:
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
    except Exception as exc:
        print(f"Skipping {team_name}: no shooting table ({exc})")
        return None

    # The shooting table is read with a two-level header; keep the inner level.
    if isinstance(shooting.columns, pd.MultiIndex):
        shooting.columns = shooting.columns.droplevel()

    try:
        merged = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        league_only = merged[merged["Comp"] == "Premier League"]
    except (KeyError, ValueError) as exc:
        # KeyError: an expected column is missing; ValueError: merge failed.
        print(f"Skipping {team_name}: could not merge shooting stats ({exc})")
        return None

    # assign() builds a new frame, avoiding SettingWithCopyWarning on the slice.
    return league_only.assign(Season=season, Team=team_name)


for year in years:
    season_page = fetch_data_notebook(standings_url, delay=REQUEST_PAUSE_SECONDS)
    if season_page is None:
        print(f"Stopping: standings page for {year} could not be fetched")
        break

    season_soup = BeautifulSoup(season_page.text, "html.parser")
    standings_tables = season_soup.select('table.stats_table')
    if not standings_tables:
        print(f"Stopping: no standings table found for {year}")
        break
    team_urls = [
        f"{FBREF_ROOT}{href}"
        for href in _matching_hrefs(standings_tables[0], '/squads/')
    ]

    # Resolve the previous season now; the next outer iteration starts there.
    prev_links = season_soup.select("a.prev")
    previous_season = prev_links[0].get("href") if prev_links else None
    print(previous_season)
    if previous_season:
        standings_url = f"{FBREF_ROOT}{previous_season}"

    for team_url in team_urls:
        print(f'now is {counter}')
        counter += 1
        team_data = _scrape_team(team_url, year)
        if team_data is not None:
            all_matches.append(team_data)
        # Pause after every team, including skipped ones, so failures do not
        # turn into a burst of back-to-back requests.
        time.sleep(REQUEST_PAUSE_SECONDS)

    if not previous_season:
        # Without a link the next pass would re-scrape this same page and
        # label it with the wrong season.
        print("Stopping: no previous-season link found")
        break

In [None]:
# Number of per-team DataFrames collected by the scraping loop.
len(all_matches)

In [None]:
# Summarise what was scraped as (team, season, row count) triples instead of
# echoing every DataFrame in full, which floods the notebook output.
[
    (frame["Team"].iat[0], int(frame["Season"].iat[0]), len(frame))
    for frame in all_matches
    if len(frame)
]

In [None]:
# Stack the per-team frames into one table. ignore_index=True gives every row
# a unique 0..n-1 label; without it each team's frame keeps its own labels,
# so the combined frame (and the CSV index column) has repeated values.
match_df = pd.concat(all_matches, ignore_index=True)

In [10]:
# Normalise every column label to lower case.
match_df.columns = match_df.columns.map(lambda label: label.lower())

In [None]:
# Display the combined frame (its column names were lower-cased in the cell above).
match_df

In [12]:
# Write the combined data to matches2.csv in the current working directory.
# With to_csv's default index=True the row index is written as the first,
# unnamed column, so readers should load it with index_col=0.
match_df.to_csv("matches2.csv")