In [4]:
#imports 
import requests
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup
from time import sleep
from numpy import random

In [1]:
years = list(range(2023,2017,-1))
all_matches = []

In [8]:
headers = {'User-Agent': 'Mozilla/5.0'}
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [12]:
for year in years:
    print(f"---------- This is year: {year}")
    data = requests.get(standings_url,headers=headers)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]
    
    links = [l.get('href') for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]
    
    # go for previous season
    previous_seasons = soup.select("a.prev")[0].get('href')
    standings_url = f"https://fbref.com{previous_seasons}"
    
    sleeptime_year = random.uniform(1, 3)
    print("sleeping for:", sleeptime_year, "seconds")
    sleep(sleeptime_year)
    print("sleeping is over")
    
    
    for team_url in team_urls:
        team_name = team_url.split('/')[-1].replace('-Stats', '').replace('-', ' ')
        
        print(f"<<<<<<<< Scraping {team_name}>>>>>>>>")
        
        data = requests.get(team_url)
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        
        soup = BeautifulSoup(data.text)
        links = [l.get('href') for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}", headers=headers)
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        
        
        try:
            team_data = matches.merge(shooting[['Date','PK', 'G/SoT', 'Dist', 'SoT%', 'npxG', 'FK', 'np:G-xG', 'G-xG', 'SoT', 'G/Sh', 'Sh', 'PKatt', 'npxG/Sh', 'Gls']], on='Date')
        
        except ValueError:
            # some teams might not have shooting stats, we will ignore it for now
            continue
        
        # filter for league only
        team_data = team_data[team_data['Comp'] == 'Premier League']
        team_data['Season'] = year
        team_data['Team'] = team_name
        all_matches.append(team_data)
        
        sleeptime = random.uniform(2, 4)
        print("sleeping for:", sleeptime, "seconds")
        sleep(sleeptime)
        print("sleeping is over")

---------- This is year: 2023
sleeping for: 1.8044872692924683 seconds
sleeping is over
<<<<<<<< Scraping Manchester City>>>>>>>>
sleeping for: 3.133579058599105 seconds
sleeping is over
<<<<<<<< Scraping Liverpool>>>>>>>>
sleeping for: 2.053377093918843 seconds
sleeping is over
<<<<<<<< Scraping Arsenal>>>>>>>>
sleeping for: 2.2833311884144623 seconds
sleeping is over
<<<<<<<< Scraping Tottenham Hotspur>>>>>>>>
sleeping for: 3.5083039906025704 seconds
sleeping is over
<<<<<<<< Scraping Aston Villa>>>>>>>>
sleeping for: 3.151691126729576 seconds
sleeping is over
<<<<<<<< Scraping Manchester United>>>>>>>>
sleeping for: 3.677387317575427 seconds
sleeping is over
<<<<<<<< Scraping Newcastle United>>>>>>>>
sleeping for: 3.233823364817783 seconds
sleeping is over
<<<<<<<< Scraping Brighton and Hove Albion>>>>>>>>
sleeping for: 2.8619220970148644 seconds
sleeping is over
<<<<<<<< Scraping West Ham United>>>>>>>>
sleeping for: 2.834593314479096 seconds
sleeping is over
<<<<<<<< Scraping Chel

In [13]:
# combine all the dataframes
matches_df = pd.concat(all_matches)

In [14]:
# lower case column names
matches_df.columns = [c.lower() for c in matches_df.columns]

In [15]:
matches_df.shape

(4040, 35)

In [17]:
matches_df.tail()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,np:g-xg,g-xg,sot,g/sh,sh,pkatt,npxg/sh,gls,season,team
35,2019-04-13,12:30,Premier League,Matchweek 34,Sat,Away,L,0,4,Tottenham,...,-0.8,-0.8,1.0,0.0,7.0,0,0.11,0,2018,Huddersfield Town
36,2019-04-20,15:00,Premier League,Matchweek 35,Sat,Home,L,1,2,Watford,...,0.0,0.0,3.0,0.08,13.0,0,0.08,1,2018,Huddersfield Town
37,2019-04-26,20:00,Premier League,Matchweek 36,Fri,Away,L,0,5,Liverpool,...,-0.2,-0.2,1.0,0.0,5.0,0,0.04,0,2018,Huddersfield Town
38,2019-05-05,14:00,Premier League,Matchweek 37,Sun,Home,D,1,1,Manchester Utd,...,0.0,0.0,3.0,0.14,7.0,0,0.14,1,2018,Huddersfield Town
39,2019-05-12,15:00,Premier League,Matchweek 38,Sun,Away,D,1,1,Southampton,...,0.3,0.3,3.0,0.1,10.0,0,0.07,1,2018,Huddersfield Town


In [18]:
matches_df.to_csv('data/matches_18_23.csv', index=False)

In [19]:
matches_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,np:g-xg,g-xg,sot,g/sh,sh,pkatt,npxg/sh,gls,season,team
1,2023-08-11,20:00,Premier League,Matchweek 1,Fri,Away,W,3,0,Burnley,...,1.1,1.1,8.0,0.18,17.0,0,0.12,3,2023,Manchester City
3,2023-08-19,20:00,Premier League,Matchweek 2,Sat,Home,W,1,0,Newcastle Utd,...,0.0,0.0,4.0,0.07,14.0,0,0.07,1,2023,Manchester City
4,2023-08-27,14:00,Premier League,Matchweek 3,Sun,Away,W,2,1,Sheffield Utd,...,-0.8,-1.5,9.0,0.07,29.0,1,0.1,2,2023,Manchester City
5,2023-09-02,15:00,Premier League,Matchweek 4,Sat,Home,W,5,1,Fulham,...,2.6,2.8,4.0,0.67,6.0,1,0.25,5,2023,Manchester City
6,2023-09-16,15:00,Premier League,Matchweek 5,Sat,Away,W,3,1,West Ham,...,-0.6,-0.6,13.0,0.1,29.0,0,0.13,3,2023,Manchester City
