# EPL Data crawler


## Overall player stats

In [None]:
# Squads to scrape: each entry pairs a display name with the URL of that
# squad's stats page on fbref.com.
teams = [
    dict(name="Liverpool", url="https://fbref.com/en/squads/822bd0ba/Liverpool-Stats"),
    dict(name="Arsenal", url="https://fbref.com/en/squads/18bb7c10/Arsenal-Stats"),
    dict(name="Nottingham Forest", url="https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats"),
    dict(name="Newcastle United", url="https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats"),
]

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def parse_player_row(row, team_name):
    """Flatten one ``<tr>`` of a squad stats table into a dict of strings.

    Args:
        row: A BeautifulSoup ``<tr>`` tag.
        team_name: Value stored under the ``'team'`` key of the result.

    Returns:
        A dict holding ``'team'``, plus ``'player_name'`` / ``'player_url'``
        when the row has a ``<th data-stat="player">`` cell, plus one entry
        per ``<td>`` that carries a ``data-stat`` attribute (keyed by that
        attribute, valued by the cell's stripped text).
    """
    record = {'team': team_name}

    name_cell = row.find('th', {'data-stat': 'player'})
    if name_cell:
        link = name_cell.find('a')
        # .get() tolerates an <a> without an href instead of raising KeyError.
        href = link.get('href') if link else None
        record['player_name'] = name_cell.text.strip()
        record['player_url'] = f"https://fbref.com{href}" if href else None

    for cell in row.find_all('td'):
        stat = cell.get('data-stat')
        if stat:
            record[stat] = cell.text.strip()

    return record


players_data = []

for team in teams:
    print(f"Scraping data for {team['name']}...")

    # requests has no default timeout; without one a stalled connection
    # would block this cell indefinitely.
    response = requests.get(team['url'], timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first <tbody> on the page is read. find() returns None when
    # there is none, so fail with a message that names the page rather than
    # an AttributeError on None.
    tbody = soup.find('tbody')
    if tbody is None:
        raise ValueError(f"No <tbody> found at {team['url']}")

    for row in tbody.find_all('tr'):
        player_data = parse_player_row(row, team['name'])

        # Keep only rows whose 'games' cell is a positive integer; this drops
        # rows without a 'games' cell and players with zero appearances.
        games = player_data.get('games') or ''
        if games.isdigit() and int(games) > 0:
            players_data.append(player_data)

df = pd.DataFrame(players_data)

display(df)


Scraping data for Liverpool...
Scraping data for Arsenal...
Scraping data for Nottingham Forest...
Scraping data for Newcastle United...


Unnamed: 0,team,player_name,player_url,nationality,position,age,games,games_starts,minutes,minutes_90s,...,assists_per90,goals_assists_per90,goals_pens_per90,goals_assists_pens_per90,xg_per90,xg_assist_per90,xg_xg_assist_per90,npxg_per90,npxg_xg_assist_per90,matches
0,Liverpool,Virgil van Dijk,https://fbref.com/en/players/e06683ca/Virgil-v...,nl NED,DF,33-192,20,20,1800,20.0,...,0.05,0.10,0.05,0.10,0.07,0.02,0.08,0.07,0.08,Matches
1,Liverpool,Mohamed Salah,https://fbref.com/en/players/e342ad68/Mohamed-...,eg EGY,FW,32-215,20,20,1762,19.6,...,0.66,1.58,0.66,1.33,0.86,0.43,1.29,0.66,1.09,Matches
2,Liverpool,Ryan Gravenberch,https://fbref.com/en/players/b8e740fb/Ryan-Gra...,nl NED,MF,22-245,20,20,1738,19.3,...,0.10,0.10,0.00,0.10,0.04,0.10,0.14,0.04,0.14,Matches
3,Liverpool,Trent Alexander-Arnold,https://fbref.com/en/players/cd1acf9d/Trent-Al...,eng ENG,DF,26-101,19,18,1485,16.5,...,0.24,0.30,0.06,0.30,0.05,0.33,0.38,0.05,0.38,Matches
4,Liverpool,Andrew Robertson,https://fbref.com/en/players/2e4f5f03/Andrew-R...,sct SCO,DF,30-311,19,18,1444,16.0,...,0.00,0.00,0.00,0.00,0.03,0.15,0.19,0.03,0.19,Matches
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,Newcastle United,Miguel Almirón,https://fbref.com/en/players/862a1c15/Miguel-A...,py PAR,FW,30-341,8,1,154,1.7,...,0.00,0.00,0.00,0.00,0.27,0.12,0.39,0.27,0.39,Matches
90,Newcastle United,Callum Wilson,https://fbref.com/en/players/c596fcb0/Callum-W...,eng ENG,"FW,MF",32-324,4,0,61,0.7,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Matches
91,Newcastle United,William Osula,https://fbref.com/en/players/7b355808/William-...,dk DEN,"FW,MF",21-165,7,0,44,0.5,...,0.00,0.00,0.00,0.00,0.00,0.18,0.18,0.00,0.18,Matches
92,Newcastle United,Lewis Miley,https://fbref.com/en/players/2c6835e5/Lewis-Miley,eng ENG,MF,18-260,3,0,20,0.2,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Matches


In [None]:
# Drop the 'matches' column before saving: in the rows displayed in the
# previous cell's output it holds only the link text "Matches".
# errors='ignore' keeps this cell re-runnable; because `df` is reassigned
# here, a second run would otherwise raise KeyError on the missing column.
df = df.drop(columns=['matches'], errors='ignore')

# index=False: do not write the DataFrame's row index as a CSV column.
df.to_csv('overall_player_stats.csv', index=False)

## Team match results: 2024-25 Premier League season


In [None]:
# Squads to scrape. Each entry carries the display name, the squad stats page
# URL, and the URL of the squad's 2024-2025 Premier League scores-and-fixtures
# page on fbref.com.
teams = [
    dict(
        name="Liverpool",
        url="https://fbref.com/en/squads/822bd0ba/Liverpool-Stats",
        fixtures_url="https://fbref.com/en/squads/822bd0ba/2024-2025/matchlogs/c9/schedule/Liverpool-Scores-and-Fixtures-Premier-League",
    ),
    dict(
        name="Arsenal",
        url="https://fbref.com/en/squads/18bb7c10/Arsenal-Stats",
        fixtures_url="https://fbref.com/en/squads/18bb7c10/2024-2025/matchlogs/c9/schedule/Arsenal-Scores-and-Fixtures-Premier-League",
    ),
    dict(
        name="Nottingham Forest",
        url="https://fbref.com/en/squads/e4a775cb/Nottingham-Forest-Stats",
        fixtures_url="https://fbref.com/en/squads/e4a775cb/2024-2025/matchlogs/c9/schedule/Nottingham-Forest-Scores-and-Fixtures-Premier-League",
    ),
    dict(
        name="Newcastle United",
        url="https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats",
        fixtures_url="https://fbref.com/en/squads/b2b47a98/2024-2025/matchlogs/c9/schedule/Newcastle-United-Scores-and-Fixtures-Premier-League",
    ),
]

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


def parse_match_row(row, team_name):
    """Flatten one ``<tr>`` of a fixtures table into a dict of strings.

    Args:
        row: A BeautifulSoup ``<tr>`` tag.
        team_name: Value stored under the ``'team'`` key of the result.

    Returns:
        A dict holding ``'team'``, plus ``'match_date'`` when the row has a
        ``<th data-stat="date">`` cell, plus one entry per ``<td>`` that
        carries a ``data-stat`` attribute (keyed by that attribute, valued by
        the cell's stripped text).
    """
    record = {'team': team_name}

    date_cell = row.find('th', {'data-stat': 'date'})
    if date_cell:
        record['match_date'] = date_cell.text.strip()

    for cell in row.find_all('td'):
        stat = cell.get('data-stat')
        if stat:
            record[stat] = cell.text.strip()

    return record


matches_data = []

for team in teams:
    print(f"Scraping data for {team['name']}...")

    # requests has no default timeout; without one a stalled connection
    # would block this cell indefinitely.
    response = requests.get(team['fixtures_url'], timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first <tbody> on the page is read. find() returns None when
    # there is none, so fail with a message that names the page rather than
    # an AttributeError on None.
    tbody = soup.find('tbody')
    if tbody is None:
        raise ValueError(f"No <tbody> found at {team['fixtures_url']}")

    for row in tbody.find_all('tr'):
        match_data = parse_match_row(row, team['name'])

        # Keep only rows with a non-empty result (presumably fixtures not yet
        # played have an empty result cell -- confirm against the page).
        # A truthiness test is used because `.get('result') != ''` is also
        # True when the key is missing (None != ''), which would let rows
        # without any result cell through.
        if match_data.get('result'):
            matches_data.append(match_data)


df2 = pd.DataFrame(matches_data)

display(df2)


Scraping data for Liverpool...
{'team': 'Liverpool', 'match_date': '2024-08-17', 'start_time': '12:30', 'round': 'Matchweek 1', 'dayofweek': 'Sat', 'venue': 'Away', 'result': 'W', 'goals_for': '2', 'goals_against': '0', 'opponent': 'Ipswich Town', 'xg_for': '2.6', 'xg_against': '0.5', 'possession': '62', 'attendance': '30,014', 'captain': 'Virgil van Dijk', 'formation': '4-2-3-1', 'opp_formation': '4-2-3-1', 'referee': 'Tim Robinson', 'match_report': 'Match Report', 'notes': ''}
{'team': 'Liverpool', 'match_date': '2024-08-25', 'start_time': '16:30', 'round': 'Matchweek 2', 'dayofweek': 'Sun', 'venue': 'Home', 'result': 'W', 'goals_for': '2', 'goals_against': '0', 'opponent': 'Brentford', 'xg_for': '2.5', 'xg_against': '0.5', 'possession': '62', 'attendance': '60,017', 'captain': 'Virgil van Dijk', 'formation': '4-2-3-1', 'opp_formation': '4-4-2', 'referee': 'Stuart Attwell', 'match_report': 'Match Report', 'notes': ''}
{'team': 'Liverpool', 'match_date': '2024-09-01', 'start_time': '1

Unnamed: 0,team,match_date,start_time,round,dayofweek,venue,result,goals_for,goals_against,opponent,xg_for,xg_against,possession,attendance,captain,formation,opp_formation,referee,match_report,notes
0,Liverpool,2024-08-17,12:30,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,2.6,0.5,62,30014,Virgil van Dijk,4-2-3-1,4-2-3-1,Tim Robinson,Match Report,
1,Liverpool,2024-08-25,16:30,Matchweek 2,Sun,Home,W,2,0,Brentford,2.5,0.5,62,60017,Virgil van Dijk,4-2-3-1,4-4-2,Stuart Attwell,Match Report,
2,Liverpool,2024-09-01,16:00,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,1.8,1.4,47,73738,Virgil van Dijk,4-2-3-1,4-2-3-1,Anthony Taylor,Match Report,
3,Liverpool,2024-09-14,15:00,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,0.9,0.4,68,60344,Virgil van Dijk,4-2-3-1,4-2-3-1,Michael Oliver,Match Report,
4,Liverpool,2024-09-21,15:00,Matchweek 5,Sat,Home,W,3,0,Bournemouth,2.0,1.1,58,60347,Virgil van Dijk,4-2-3-1,4-2-3-1,Tony Harrington,Match Report,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,Newcastle United,2024-12-21,15:00,Matchweek 17,Sat,Away,W,4,0,Ipswich Town,2.3,0.7,58,29774,Bruno Guimarães,4-3-3,4-2-3-1,Stuart Attwell,Match Report,
79,Newcastle United,2024-12-26,15:00,Matchweek 18,Thu,Home,W,3,0,Aston Villa,2.7,0.3,62,52168,Bruno Guimarães,4-3-3,4-2-3-1,Anthony Taylor,Match Report,
80,Newcastle United,2024-12-30,20:00,Matchweek 19,Mon,Away,W,2,0,Manchester Utd,1.9,0.8,48,73809,Bruno Guimarães,4-3-3,3-4-3,Simon Hooper,Match Report,
81,Newcastle United,2025-01-04,12:30,Matchweek 20,Sat,Away,W,2,1,Tottenham,2.5,0.9,44,61293,Bruno Guimarães,4-3-3,4-2-3-1,Andy Madley,Match Report,


In [None]:
# Save the scraped match rows to CSV; index=False leaves the DataFrame's
# row index out of the file.
df2.to_csv(path_or_buf='match_summary.csv', index=False)