In [None]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [38]:
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## **Scrape first page with requests**

In [None]:
# FBref Premier League stats page; the league table on it supplies the squad
# links that are collected further down
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# download the standings page; the timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status stops here on an HTTP error
# (e.g. a rate-limit response) instead of parsing an error page later on
response = requests.get(standings_url, timeout=30)
response.raise_for_status()
# keep the html content as a string
data = response.text

## **Parse HTML using BeautifulSoup**

In [None]:
# build a navigable tree from the downloaded html with Python's built-in parser
soup = BeautifulSoup(markup=data, features="html.parser")

In [None]:
# every element matching the "table.stats_table" CSS selector, in page order
stats_tables = soup.select('table.stats_table')
# the premier league table is the first of them
prem_table = stats_tables[0]
prem_table

In [None]:
# collect the href of every "a" tag inside the prem_table
links = [anchor.get("href") for anchor in prem_table.find_all("a")]
# keep only squad links; an anchor without an href yields None, which would
# make the substring test raise TypeError, so those are skipped first
links = [link for link in links if link and "/squads/" in link]
# turn the (site-relative) links into full urls
team_urls = [f"https://fbref.com{link}" for link in links]

## **Get match stats using the Pandas Library and requests**

In [None]:
# download the page of the first team in the table
team_url = team_urls[0]
response = requests.get(team_url, timeout=30)
# stop on an HTTP error rather than handing an error page to the parser
response.raise_for_status()
data = response.text
# parse every table on the page; the markup is wrapped in StringIO because
# passing literal html to read_html is deprecated (source of the FutureWarning)
tables = pd.read_html(StringIO(data))
# the scores and fixtures table is the one that has a "Referee" column
matches = [table for table in tables if "Referee" in table]
matches = matches[0]
matches.head()


  tables = pd.read_html(data)


Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,,,55,81145.0,Kyle Walker,4-2-3-1,Stuart Attwell,Match Report,Arsenal won on penalty kicks following normal ...
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,1.9,0.3,65,21572.0,Kevin De Bruyne,4-2-3-1,Craig Pawson,Match Report,
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,,,74,,Kyle Walker,4-2-3-1,François Letexier,Match Report,
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,1.0,0.3,59,53419.0,Kyle Walker,4-2-3-1,Robert Jones,Match Report,
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,3.5,0.7,79,31336.0,Kyle Walker,4-2-3-1,Jarred Gillett,Match Report,


## **Extract shooting data using pandas and requests**

In [None]:
# re-parse the team page and pick out the link to its shooting match log
soup = BeautifulSoup(data, "html.parser")
links = [
    href
    for href in (anchor.get("href") for anchor in soup.find_all("a"))
    # anchors without an href give None; skip them before the substring test
    if href and "all_comps/shooting/" in href
]
# the first matching link is the one that gets followed
link = links[0]
link

'/en/squads/b8fd03ef/2023-2024/matchlogs/all_comps/shooting/Manchester-City-Match-Logs-All-Competitions'

In [None]:
# download the shooting page; `link` already starts with "/", so it is appended
# directly to the host (an extra "/" would produce a "//" path in the url)
response = requests.get(f"https://fbref.com{link}", timeout=30)
# stop on an HTTP error rather than handing an error page to the parser
response.raise_for_status()
data = response.text
# extract shooting table using pandas library; StringIO avoids the deprecated
# "literal html" input form of read_html
shooting = pd.read_html(StringIO(data))
shooting = shooting[0]

In [None]:
# remove the top level of the multilevel column index from the DataFrame.
# Guarded so that re-running this cell is harmless: once the columns are
# single-level, droplevel() would raise instead of being a no-op.
if shooting.columns.nlevels > 1:
    shooting.columns = shooting.columns.droplevel()
shooting.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,...,,,0,0,,,,,,Match Report
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,13.9,0.0,0,0,1.9,1.9,0.12,1.1,1.1,Match Report
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,...,,,0,0,,,,,,Match Report
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,17.9,0.0,0,0,1.0,1.0,0.07,0.0,0.0,Match Report
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,17.3,2.0,0,1,3.5,2.8,0.1,-1.5,-0.8,Match Report


## **Merge match and shooting data**

In [None]:
# shooting columns to carry over; "Date" doubles as the join key
shooting_cols = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]
# join the fixtures with the selected shooting columns on the match date
team_stats = pd.merge(matches, shooting[shooting_cols], on="Date")
team_stats.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,...,4-2-3-1,Stuart Attwell,Match Report,Arsenal won on penalty kicks following normal ...,8,4,,,0,0
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,4-2-3-1,Craig Pawson,Match Report,,17,8,13.9,0.0,0,0
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,...,4-2-3-1,François Letexier,Match Report,,23,7,,,0,0
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,4-2-3-1,Robert Jones,Match Report,,14,4,17.9,0.0,0,0
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,4-2-3-1,Jarred Gillett,Match Report,,29,9,17.3,2.0,0,1


In [None]:
# (rows, columns) of the scores and fixtures table, to compare with the shooting table
matches.shape

(57, 19)

In [None]:
# (rows, columns) of the shooting table
shooting.shape

(58, 26)

In [23]:
# seasons to scrape, newest first
years = sorted(range(2023, 2025), reverse=True)
years

[2024, 2023]

In [24]:
# accumulator for the per-team DataFrames; re-run this cell before re-running
# the scraping loop, otherwise the loop appends to the previous results
all_matches = list()

In [25]:
# url with premier league table; the scraping loop below overwrites this
# variable with the previous-season url, so re-run this cell before re-running
# the loop to start from this page again
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# seconds to wait after each team's pages have been downloaded
REQUEST_PAUSE_SECONDS = 3
# columns taken over from the shooting table ("Date" is the merge key)
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def fetch_html(url):
  """Download `url` and return the response body as a string.

  Raises requests.HTTPError on a 4xx/5xx response so that an error page is
  never handed to the html parsers, and gives up after 30 seconds instead of
  waiting indefinitely on a stalled connection.
  """
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  return response.text


# loop through all the years
for year in years:
  # download and parse the standings page of the season being processed
  standings_html = fetch_html(standings_url)
  standings_soup = BeautifulSoup(standings_html, "html.parser")

  # select the premier league table from the page
  prem_table = standings_soup.select('table.stats_table')[0]

  # href of every "a" tag inside the table (None for anchors without an href)
  hrefs = [anchor.get("href") for anchor in prem_table.find_all("a")]
  # keep only squad links, skipping the None entries
  squad_links = [href for href in hrefs if href and "/squads/" in href]
  # turn the links into full urls
  team_urls = [f"https://fbref.com{href}" for href in squad_links]

  # the next pass of the outer loop starts from the previous season's page
  prev_season = standings_soup.select('a.prev')[0].get("href")
  standings_url = f"https://fbref.com{prev_season}"

  # loop through all the team urls
  for team_url in team_urls:
    # team name = last url segment without the "-Stats" suffix, dashes as spaces
    team_name = team_url.split('/')[-1].replace("-Stats","").replace("-"," ")
    team_html = fetch_html(team_url)

    # scores and fixtures table: the one on the page with a "Referee" column.
    # StringIO avoids the deprecated "literal html" input form of read_html.
    tables = pd.read_html(StringIO(team_html))
    matches = [table for table in tables if "Referee" in table]
    matches = matches[0]

    # find the href link to the shooting page
    team_soup = BeautifulSoup(team_html, "html.parser")
    hrefs = [anchor.get("href") for anchor in team_soup.find_all("a")]
    shooting_links = [href for href in hrefs if href and "all_comps/shooting/" in href]
    # the href already starts with "/", so it is appended directly to the host
    shooting_html = fetch_html(f"https://fbref.com{shooting_links[0]}")
    # extract shooting table using pandas library
    shooting = pd.read_html(StringIO(shooting_html))[0]
    # remove multilevel index from the DataFrame
    shooting.columns = shooting.columns.droplevel()

    # pause before the next team's requests; done here, ahead of the merge, so
    # that a team skipped by `continue` below is throttled as well
    time.sleep(REQUEST_PAUSE_SECONDS)

    try:
      # merge matches and shooting DataFrames on date
      team_stats = matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
    except (KeyError, ValueError) as error:
      # KeyError: an expected shooting column is missing;
      # ValueError: the two "Date" columns cannot be merged.
      # Either way the team is skipped, but visibly rather than silently.
      print(f"Skipping {team_name} ({year}): {error}")
      continue

    # keep premier league matches only; .copy() makes the result an independent
    # frame so the column assignments below do not write into a slice
    team_stats = team_stats[team_stats["Comp"] == "Premier League"].copy()
    # add team name and season to the DataFrame
    team_stats["Team"] = team_name
    team_stats["Season"] = year

    # add team stats to the all matches list
    all_matches.append(team_stats)

In [39]:
# combine all individual DataFrames into one DataFrame; each per-team frame
# keeps its own row labels, so renumber the rows to get a unique index
matches_df = pd.concat(all_matches, ignore_index=True)

In [40]:
# persist the combined match data as "matches.csv" in the working directory
matches_df.to_csv(path_or_buf="matches.csv")