In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Seasons to scrape, newest first.
years = sorted(range(2022, 2025), reverse=True)

In [3]:
# Show the seasons that will be scraped (newest first).
years

[2024, 2023, 2022]

In [4]:
# Accumulator for the per-team, per-season match DataFrames built by the scraping loop.
all_matches = list()

In [5]:
# Starting page for the scrape: FBref's Premier League stats page. The season
# loop begins here and works backwards through earlier seasons.
stats_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [6]:
connect_timeout = 6
read_timeout = 100
# Seconds to wait after *every* HTTP request (season, team and shooting pages).
# NOTE(review): confirm this against the site's published crawl limits.
request_pause = 5

# Columns taken from the shooting table and merged onto the fixtures table.
SHOOTING_COLUMNS = ["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]


def _fetch_html(url):
    """Download ``url`` and return the response body as text.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error page
    is never handed to the HTML parsers, and sleeps ``request_pause`` seconds
    afterwards so that every request is throttled, including those made just
    before a team is skipped.
    """
    response = requests.get(url, timeout=(connect_timeout, read_timeout))
    response.raise_for_status()
    time.sleep(request_pause)
    return response.text


def _hrefs_containing(soup_node, fragment):
    """Return the href of every <a> under ``soup_node`` whose href contains ``fragment``."""
    hrefs = (anchor.get("href") for anchor in soup_node.find_all("a"))
    # Anchors without an href yield None; skip them instead of raising TypeError.
    return [href for href in hrefs if href and fragment in href]


# Start from a clean slate so re-running this cell does not duplicate rows.
all_matches.clear()

# Walk the seasons with a local cursor instead of overwriting ``stats_url``;
# a re-run therefore always starts again from the most recent season.
season_url = stats_url

for year in years:
    season_soup = BeautifulSoup(_fetch_html(season_url), "html.parser")
    season_table = season_soup.select("table.stats_table")[0]

    team_urls = [
        f"https://fbref.com{href}"
        for href in _hrefs_containing(season_table, "/squads/")
    ]

    # The "previous season" link becomes the page for the next loop iteration.
    previous_season = season_soup.select("a.prev")[0].get("href")
    season_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        # e.g. ".../Manchester-City-Stats" -> "Manchester City"
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

        team_html = _fetch_html(team_url)
        # Wrap in StringIO: passing literal HTML to read_html is deprecated.
        team_matches = pd.read_html(StringIO(team_html), match="Scores & Fixtures")[0]

        shooting_links = _hrefs_containing(
            BeautifulSoup(team_html, "html.parser"), "all_comps/shooting/"
        )
        if not shooting_links:
            print(f"Skipping {team_name} ({year}): no shooting page link found")
            continue

        shooting_html = _fetch_html(f"https://fbref.com{shooting_links[0]}")
        shooting = pd.read_html(StringIO(shooting_html), match="Shooting")[0]
        # The shooting table has a two-level header; keep only the inner level.
        shooting.columns = shooting.columns.droplevel()

        try:
            team_info = team_matches.merge(shooting[SHOOTING_COLUMNS], on="Date")
        except (KeyError, ValueError) as exc:
            # KeyError: expected shooting columns are missing;
            # ValueError: the frames cannot be merged on "Date".
            print(f"Skipping {team_name} ({year}): {exc!r}")
            continue

        # .copy() so the column assignments below act on an independent frame
        # rather than a slice (avoids SettingWithCopyWarning).
        team_info = team_info[team_info["Comp"] == "Premier League"].copy()
        team_info["Season"] = year
        team_info["Team"] = team_name
        all_matches.append(team_info)

In [7]:
# Stack every per-team frame row-wise into one table of matches.
match_df = pd.concat(objs=all_matches, axis=0)

In [8]:
# (rows, columns) of the combined match table, as a quick sanity check on the scrape.
match_df.shape

(2234, 27)

In [9]:
# Number of rows per season; a season with fewer rows than the others was
# presumably still in progress when the data was scraped.
match_df.loc[:, "Season"].value_counts()

2023    760
2022    760
2024    714
Name: Season, dtype: int64

In [10]:
# Matches recorded per team for the 2024 season, to see which teams have fewer rows.
is_2024 = match_df["Season"] == 2024
match_df.loc[is_2024, "Team"].value_counts()

Arsenal                     36
Wolverhampton Wanderers     36
Burnley                     36
Luton Town                  36
Nottingham Forest           36
Brentford                   36
Everton                     36
Crystal Palace              36
Fulham                      36
Sheffield United            36
Bournemouth                 36
West Ham United             36
Aston Villa                 36
Liverpool                   36
Manchester City             35
Manchester United           35
Chelsea                     35
Newcastle United            35
Tottenham Hotspur           35
Brighton and Hove Albion    35
Name: Team, dtype: int64

In [11]:
# Preview the first five rows of the combined table.
match_df.head(n=5)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2023-08-12,12:30,Premier League,Matchweek 1,Sat,Home,W,2,1,Nott'ham Forest,...,Match Report,,15.0,7.0,19.1,0.0,0,0,2024,Arsenal
2,2023-08-21,20:00,Premier League,Matchweek 2,Mon,Away,W,1,0,Crystal Palace,...,Match Report,,13.0,2.0,16.4,0.0,1,1,2024,Arsenal
3,2023-08-26,15:00,Premier League,Matchweek 3,Sat,Home,D,2,2,Fulham,...,Match Report,,18.0,9.0,13.8,0.0,1,1,2024,Arsenal
4,2023-09-03,16:30,Premier League,Matchweek 4,Sun,Home,W,3,1,Manchester Utd,...,Match Report,,17.0,5.0,15.0,0.0,0,0,2024,Arsenal
5,2023-09-17,16:30,Premier League,Matchweek 5,Sun,Away,W,1,0,Everton,...,Match Report,,13.0,4.0,17.4,0.0,0,0,2024,Arsenal


In [12]:
# Persist the scraped matches; the DataFrame index is not written to the file.
match_df.to_csv(path_or_buf='../datasets/matches.csv', index=False)