In [2]:
import json
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# load in variables
# The file is opened explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("URL_Constants.json", encoding="utf-8") as f:
    url_constants = json.load(f)

# Fail fast, with a readable message, if a key that this notebook reads from
# url_constants ("base_url", "league_url") is absent from the file.
missing_keys = [key for key in ("base_url", "league_url") if key not in url_constants]
if missing_keys:
    raise KeyError(f"URL_Constants.json is missing required key(s): {missing_keys}")

In [4]:
# range of match data
# NOTE(review): `year` is only used as a label. The page that is actually scraped
# is found by starting at url_constants["league_url"] and following each page's
# "previous season" link, so the labels are correct only if league_url points at
# the season labelled 2023 -- confirm against URL_Constants.json.
years = list(range(2023, 2020, -1))
all_matches = []
league_season_url = url_constants["league_url"]

# Seconds to pause after every HTTP request to avoid request rate limiting.
# NOTE(review): raise this if the target site enforces a stricter limit.
REQUEST_DELAY_SECONDS = 1
# Seconds before an unresponsive request is abandoned instead of hanging the cell.
REQUEST_TIMEOUT_SECONDS = 30
# Columns taken from the shooting table and merged onto the fixture table.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch_html(url):
    """Download ``url`` and return the response body as text.

    Pauses for ``REQUEST_DELAY_SECONDS`` after every request (successful or
    not) so consecutive calls are always spaced out.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never handed to the HTML parsers as if it were data.
        requests.Timeout: if no response arrives within
            ``REQUEST_TIMEOUT_SECONDS``.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    time.sleep(REQUEST_DELAY_SECONDS)
    response.raise_for_status()
    return response.text


for year in years:
    if league_season_url is None:
        print(f"{year}: no previous-season link was found on the last page, stopping early")
        break

    # An explicit parser keeps the parse identical on every machine and avoids
    # BeautifulSoup's "no parser was explicitly specified" warning.
    season_soup = BeautifulSoup(_fetch_html(league_season_url), "html.parser")

    # access the final league table for the season
    league_table = season_soup.select_one('table.stats_table')
    if league_table is None:
        raise IndexError(f"No 'table.stats_table' element found at {league_season_url}")

    # get the link to each teams stats for said season
    # (anchors without an href yield None and are skipped)
    team_links = [a.get("href") for a in league_table.find_all('a')]
    team_urls = [url_constants["base_url"] + href for href in team_links if href and '/squads/' in href]

    # set the previous seasons URL in order to continue loop; a missing link is
    # only a problem if another season still has to be processed (checked above)
    previous_link = season_soup.select_one("a.prev")
    previous_href = previous_link.get("href") if previous_link is not None else None
    league_season_url = (url_constants["base_url"] + previous_href) if previous_href else None

    for team_url in team_urls:
        # team name = last URL segment without the "-Stats" suffix, hyphens as spaces
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        team_html = _fetch_html(team_url)

        # read in match data as a dataframe
        # (StringIO: passing a raw HTML string to read_html is deprecated)
        matches_df = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

        # get the URL for the shooting data for the team
        team_soup = BeautifulSoup(team_html, "html.parser")
        shooting_links = [a.get("href") for a in team_soup.find_all("a")]
        shooting_links = [href for href in shooting_links if href and 'all_comps/shooting/' in href]
        if not shooting_links:
            print(f"{year} - {team_name}: skipped (no shooting link on team page)")
            continue
        team_shooting_url = url_constants["base_url"] + shooting_links[0]

        # read the shooting data into a pandas dataframe
        shooting_html = _fetch_html(team_shooting_url)
        shooting_df = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
        # the shooting table is read with a two-level header; drop the outer level
        shooting_df.columns = shooting_df.columns.droplevel()

        try:
            # merge the shooting data with fixture data to give more information
            team_data = matches_df.merge(shooting_df[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError) as error:
            # Selecting a column that is absent raises KeyError; merge problems
            # raise ValueError. Either way the team is skipped, but visibly.
            print(f"{year} - {team_name}: skipped ({error!r})")
            continue

        # filter data specifically to Premier league + add identifiers for team and year
        # (.assign returns a new frame, so no SettingWithCopyWarning on the filtered slice)
        team_data = team_data[team_data["Comp"] == "Premier League"].assign(
            Season=year,
            Team=team_name,
        )
        all_matches.append(team_data)
        print(f"{year} - {team_name}")

# concatenate all data and store in a csv
if not all_matches:
    raise ValueError("No match data was scraped; matches.csv was not written")
match_df = pd.concat(all_matches)
match_df.columns = match_df.columns.str.lower()
match_df.to_csv("matches.csv")

2023 - Manchester City
2023 - Arsenal
2023 - Manchester United
2023 - Newcastle United
2023 - Liverpool
2023 - Brighton and Hove Albion
2023 - Aston Villa
2023 - Tottenham Hotspur
2023 - Brentford
2023 - Fulham
2023 - Crystal Palace
2023 - Chelsea
2023 - Wolverhampton Wanderers
2023 - West Ham United
2023 - Bournemouth
2023 - Nottingham Forest
2023 - Everton
2023 - Leicester City
2023 - Leeds United
2023 - Southampton
2022 - Manchester City
2022 - Liverpool
2022 - Chelsea
2022 - Tottenham Hotspur
2022 - Arsenal
2022 - Manchester United
2022 - West Ham United
2022 - Leicester City
2022 - Brighton and Hove Albion
2022 - Wolverhampton Wanderers
2022 - Newcastle United
2022 - Crystal Palace
2022 - Brentford
2022 - Aston Villa
2022 - Southampton
2022 - Everton
2022 - Leeds United
2022 - Burnley
2022 - Watford
2022 - Norwich City
2021 - Manchester City
2021 - Manchester United
2021 - Liverpool
2021 - Chelsea
2021 - Leicester City
2021 - West Ham United
2021 - Tottenham Hotspur
2021 - Arsenal