# Premier League Data Scraping

### Libraries

In [12]:
import time
import random
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup

### Scraping first page with requests

In [3]:
# Premier League overview page for the 2023-2024 season on fbref.
standings_url = (
    "https://fbref.com"
    "/en/comps/9/2023-2024/2023-2024-Premier-League-Stats"
)

In [4]:
# Download the league page. The timeout stops a stalled connection from hanging
# the kernel, and raise_for_status() surfaces HTTP errors (e.g. rate limiting)
# here instead of as a confusing IndexError in a later parsing cell.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

### Parsing HTML links with BeautifulSoup

In [5]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so results can differ between
# machines. 'html.parser' matches what the multi-season loop below uses.
soup = BeautifulSoup(data.text, 'html.parser')

In [6]:
# Take the first table carrying the "stats_table" CSS class; the next cells read
# the team links out of it. Raises IndexError if no such table is in the page.
standings_table = soup.select("table.stats_table")[0]

In [7]:
# Collect every anchor (<a>) element found inside the standings table.
links = standings_table.find_all('a')

In [8]:
# Reduce each anchor element to the value of its href attribute
# (None for anchors that have no href).
links = [anchor.get("href") for anchor in links]

In [9]:
# Keep only links that point at team (squad) pages. Anchors without an href
# come through as None, which would make the substring test raise TypeError,
# so those are filtered out first.
links = [href for href in links if href and '/squads/' in href]

In [10]:
# The hrefs are site-relative paths; prefix the host to get absolute URLs.
team_urls = [f"https://fbref.com{path}" for path in links]

### Extract match stats using pandas and requests

In [11]:
# Work with the first team URL in the cells that follow.
team_url = team_urls[0]

In [12]:
# Download the team page. Fail fast on a stalled connection or an HTTP error
# rather than handing an error page to pandas in the next cell.
data = requests.get(team_url, timeout=30)
data.raise_for_status()
# pandas.read_html expects a file-like object for literal HTML, so wrap the text.
str_data = StringIO(data.text)

In [13]:
# Parse the tables whose text matches 'Scores & Fixtures'. read_html returns a
# list of DataFrames; the merge cell further down uses the first one.
matches = pd.read_html(str_data, match='Scores & Fixtures')

### Get matches shooting stats with BeautifulSoup and requests

In [14]:
# Parse the team page. The parser is named explicitly so the result does not
# depend on which optional parsers are installed, and to match the
# 'html.parser' choice used in the multi-season loop below.
soup = BeautifulSoup(data.text, 'html.parser')

In [15]:
# Collect every anchor (<a>) element on the team page.
links = soup.find_all('a')

In [16]:
# Swap each anchor element for its href value (None when the attribute is absent).
links = [anchor.get("href") for anchor in links]

In [17]:
# Drop missing hrefs, then keep only the all-competitions shooting links.
links = [href for href in links if href and 'all_comps/shooting/' in href]

In [18]:
# Download the first shooting-stats page found above. Fail fast on a stalled
# connection or an HTTP error rather than parsing an error page.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()
# pandas.read_html expects a file-like object for literal HTML, so wrap the text.
str_data = StringIO(data.text)

In [19]:
# Parse the tables whose text matches "Shooting" and keep the first one.
shooting = pd.read_html(str_data, match="Shooting")[0]

### Cleaning scraped data with pandas

In [20]:
# Drop the outer level of the multi-level header so columns such as "Sh" and
# "SoT" can be selected by plain name. The isinstance guard makes the cell
# idempotent: once the header is flat, an unconditional droplevel() would raise
# ValueError on re-run.
if isinstance(shooting.columns, pd.MultiIndex):
    shooting.columns = shooting.columns.droplevel()

### Merging Shooting and Scores & Fixtures tables

In [21]:
# Join the fixtures table with the selected shooting columns on match date.
# The default (inner) join keeps only dates present in both tables.
team_data = pd.merge(
    matches[0],
    shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]],
    on="Date",
)

In [22]:
# Preview the first rows of the merged frame.
team_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Opp Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2023-08-06,16:00,Community Shield,FA Community Shield,Sun,Neutral,D,1 (1),1 (4),Arsenal,...,4-3-3,Stuart Attwell,Match Report,Arsenal won on penalty kicks following normal ...,8,4,,,0,0
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,5-4-1,Craig Pawson,Match Report,,17,8,13.9,0.0,0,0
2,2023-08-16,22:00,Super Cup,UEFA Super Cup,Wed,Home,D,1 (5),1 (4),es Sevilla,...,4-2-3-1,François Letexier,Match Report,,23,7,,,0,0
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,4-3-3,Robert Jones,Match Report,,14,4,17.9,0.0,0,0
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,3-5-2,Jarred Gillett,Match Report,,29,9,17.3,2.0,0,1


### Scraping data for multiple seasons and teams with a loop

In [8]:
# Season start years to scrape: 2011 through 2023 inclusive.
years = [*range(2011, 2024)]

In [9]:
# Accumulator for the per-team DataFrames appended by the scraping loop.
all_matches = list()

In [13]:
# Seconds to pause after each team so consecutive requests are spaced out.
REQUEST_DELAY_SECONDS = 7
# Per-request timeout so a stalled connection cannot hang the whole scrape.
REQUEST_TIMEOUT_SECONDS = 30
# Shooting columns merged onto each team's fixtures.
SHOOTING_COLUMNS = ["Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _scrape_team_matches(team_url, year):
    """Scrape one team's Premier League matches for a season, with shooting stats.

    Args:
        team_url: Absolute URL of the team's season page.
        year: Starting year of the season; stored in the ``Season`` column.

    Returns:
        A DataFrame of the team's 'Premier League' rows from the
        'Scores & Fixtures' table, left-joined with the shooting columns on
        ``Date`` and tagged with ``Season`` and ``Team``. Returns ``None``
        (after printing the reason) when the shooting data cannot be found,
        parsed or merged.

    Raises:
        requests.HTTPError: If the team page request returns an error status.
        ValueError: If the team page has no 'Scores & Fixtures' table.
    """
    # Derive a readable team name from the last URL segment.
    team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ')

    # Get team's matches (Scores & Fixtures)
    team_page = requests.get(team_url, timeout=REQUEST_TIMEOUT_SECONDS)
    team_page.raise_for_status()
    matches = pd.read_html(StringIO(team_page.text), match='Scores & Fixtures')[0]

    # Parse the team page for the shooting data link
    team_soup = BeautifulSoup(team_page.text, 'html.parser')
    hrefs = [a.get('href') for a in team_soup.find_all('a') if a.get('href')]
    shooting_links = [h for h in hrefs if 'all_comps/shooting/' in h]
    if not shooting_links:
        print(f"Skipping {team_name} ({year}): no shooting stats link found")
        return None

    # Get shooting data. An error page has no matching table, so it falls into
    # the ValueError branch below and the team is skipped rather than crashing.
    shooting_page = requests.get(
        f"https://fbref.com{shooting_links[0]}", timeout=REQUEST_TIMEOUT_SECONDS
    )
    try:
        shooting = pd.read_html(StringIO(shooting_page.text), match="Shooting")[0]
    except ValueError:
        print(f"Skipping {team_name} ({year}): no Shooting table found")
        return None

    # Flatten the header only when it really is multi-level. An unconditional
    # droplevel() raises ValueError on a flat header, which previously caused
    # the team to be skipped silently.
    if isinstance(shooting.columns, pd.MultiIndex):
        shooting.columns = shooting.columns.droplevel()

    # Ensure all required columns are present in the shooting dataframe.
    # NOTE(review): a missing stat is recorded as 0, which is indistinguishable
    # from a real zero; confirm this is intended rather than a missing value.
    for col in SHOOTING_COLUMNS:
        if col not in shooting.columns:
            shooting[col] = 0

    # Left join so fixtures without shooting data are still kept.
    try:
        team_data = matches.merge(
            shooting[["Date", *SHOOTING_COLUMNS]], on="Date", how="left"
        )
    except ValueError:
        print(f"Skipping {team_name} ({year}): could not merge shooting data")
        return None

    # Filter for Premier League matches only. The explicit copy makes the
    # column assignments below act on an independent frame instead of a slice
    # (avoids SettingWithCopyWarning).
    team_data = team_data[team_data['Comp'] == 'Premier League'].copy()

    # Add season and team information
    team_data['Season'] = year
    team_data['Team'] = team_name
    return team_data


# NOTE: this loop appends to `all_matches`, which is initialised in an earlier
# cell; re-running this cell without re-running that one duplicates rows.
for year in years:
    print("Retrieving data for year", year)

    standings_url = f"https://fbref.com/en/comps/9/{year}-{year + 1}/{year}-{year + 1}-Premier-League-Stats"
    data = requests.get(standings_url, timeout=REQUEST_TIMEOUT_SECONDS)
    data.raise_for_status()
    soup = BeautifulSoup(data.text, 'html.parser')

    standings_table = soup.select('table.stats_table')[0]

    # Find all team URLs, ignoring anchors that have no href.
    links = [
        href
        for href in (a.get('href') for a in standings_table.find_all('a'))
        if href and '/squads' in href
    ]
    team_urls = [f'https://fbref.com{l}' for l in links]

    for team_url in team_urls:
        team_data = _scrape_team_matches(team_url, year)
        if team_data is not None:
            # Append team data to the list of all matches
            all_matches.append(team_data)

        # Pause after every team, including skipped ones, so requests stay spaced out.
        time.sleep(REQUEST_DELAY_SECONDS)

Retrieving data for year 2011
Retrieving data for year 2012
Retrieving data for year 2013
Retrieving data for year 2014
Retrieving data for year 2015
Retrieving data for year 2016
Retrieving data for year 2017
Retrieving data for year 2018
Retrieving data for year 2019
Retrieving data for year 2020
Retrieving data for year 2021
Retrieving data for year 2022
Retrieving data for year 2023


In [14]:
# Stack all per-team frames into one DataFrame. Each frame keeps its own row
# labels, so index values repeat across teams in the combined result.
match_df = pd.concat(all_matches)

In [15]:
# Normalise every column name to lowercase.
match_df.columns = list(map(str.lower, match_df.columns))

In [None]:
# One row per (date, team, opponent, season): keep the first occurrence and
# discard any repeats.
match_df_cleaned = match_df.drop_duplicates(
    subset=["date", "team", "opponent", "season"],
    keep="first",
)

In [16]:
# Persist the de-duplicated frame. Writing `match_df` here would discard the
# drop_duplicates step and save any repeated (date, team, opponent, season) rows.
match_df_cleaned.to_csv("../data/premier_league_data.csv", index=False)

In [17]:
# Display the combined match data.
match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,sh,sot,dist,fk,pk,pkatt,season,team,xg,xga
0,2011-08-15,,Premier League,Matchweek 1,Mon,Home,W,4,0,Swansea City,...,,,,0.0,0.0,,2011,Manchester City,,
1,2011-08-21,,Premier League,Matchweek 2,Sun,Away,W,3,2,Bolton,...,,,,0.0,0.0,,2011,Manchester City,,
2,2011-08-28,,Premier League,Matchweek 3,Sun,Away,W,5,1,Tottenham,...,,,,0.0,0.0,,2011,Manchester City,,
3,2011-09-10,,Premier League,Matchweek 4,Sat,Home,W,3,0,Wigan Athletic,...,,,,0.0,0.0,,2011,Manchester City,,
5,2011-09-18,,Premier League,Matchweek 5,Sun,Away,D,2,2,Fulham,...,,,,0.0,0.0,,2011,Manchester City,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36,2024-04-24,20:00,Premier League,Matchweek 29,Wed,Away,L,2,4,Manchester Utd,...,10.0,4.0,17.8,1.0,0.0,0.0,2023,Sheffield United,0.8,2.9
37,2024-04-27,15:00,Premier League,Matchweek 35,Sat,Away,L,1,5,Newcastle Utd,...,15.0,4.0,13.5,0.0,0.0,0.0,2023,Sheffield United,1.5,3.1
38,2024-05-04,15:00,Premier League,Matchweek 36,Sat,Home,L,1,3,Nott'ham Forest,...,16.0,4.0,18.0,0.0,1.0,1.0,2023,Sheffield United,2.2,1.5
39,2024-05-11,15:00,Premier League,Matchweek 37,Sat,Away,L,0,1,Everton,...,13.0,1.0,21.0,0.0,0.0,0.0,2023,Sheffield United,0.6,2.3
