##Web Scraping data about matches from The EPL with Python

In this notebook I will load Premier League match data from their web page using the Beautiful Soup library and its methods to further work with the analysis of this dataframe and predictions. 

1.  Parsing data using Beautiful soup 
2.  Cleaning data with pandas for machine learning
1.  Saving completed dataframe

In [53]:
#import python library

import requests

In [54]:
#Founding the page with data

standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [55]:
#Downloading data

data = requests.get(standings_url)

In [56]:
#Looking for HTML file

data.text



In [57]:
#Installing library for parsing HTML data

from bs4 import BeautifulSoup

In [58]:
#Here I got stats from the HTML page using library selectors and methods

soup = BeautifulSoup(data.text)
standings_table = soup.select('table.stats_table')[0]
links = standings_table.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if '/squads/' in l]

In [59]:
#Posting a link before the HTML document part

team_urls = [f"https://fbref.com{l}" for l in links]

In [60]:
#Getting the data but not in essential format

data = requests.get(team_urls[0])

In [61]:
#Using pandas I can read HTML and get exactly what i want from the page that has string mentioned in code

import pandas as pd
matches = pd.read_html(data.text, match="Scores & Fixtures")[0]

In [62]:
#Getting number of shots, the number of shots on target, number of penalty and etc what can be helpful 

soup = BeautifulSoup(data.text)
links = soup.find_all('a')
links = [l.get("href") for l in links]
links = [l for l in links if l and 'all_comps/shooting/' in l]

In [63]:
#Getting mentioned data

data = requests.get(f"https://fbref.com{links[0]}")

In [64]:
#Read in shooting stats

shooting = pd.read_html(data.text, match="Shooting")[0]

In [65]:
#Look on shootings

shooting.head()

Unnamed: 0_level_0,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,For Manchester City,...,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2022-07-30,17:00,Community Shield,FA Community Shield,Sat,Neutral,L,1,3,Liverpool,...,,,0,0,,,,,,Match Report
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2,0,West Ham,...,18.2,1.0,1,1,2.3,1.5,0.11,-0.3,-0.5,Match Report
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,0,Bournemouth,...,15.6,0.0,0,0,1.6,1.6,0.08,1.4,1.4,Match Report
3,,,,,,,2-0-1,7,3,,...,16.6,1.0,1,1,3.9,3.1,0.09,2.1,1.9,


In [66]:
#Drop one index level

shooting.columns = shooting.columns.droplevel()

In [67]:
#Merging dataframes above together 

team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")

In [68]:
#Check

team_data

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2022-07-30,17:00,Community Shield,FA Community Shield,Sat,Neutral,L,1.0,3.0,Liverpool,...,4-3-3,Craig Pawson,Match Report,,14,8,,,0,0
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,4-3-3,Michael Oliver,Match Report,,13,1,18.2,1.0,1,1
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,4-3-3,David Coote,Match Report,,20,7,15.6,0.0,0,0


In [69]:
#Scrape data for multiplie teamas and multiplie years 

years = list(range(2022, 2020, -1))
all_matches = []

In [70]:
years

[2022, 2021]

In [71]:
#The URL from the beginning where all work started

standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [73]:
#Copying some code and initialazing a list called 'matches' which will contain several data frames and each dataframe is going to contain the match logs for one team in one season

import time
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]
    
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"
    
#Individually scrape the match logs for each team

for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(data.text, match="Shooting")[1]
        shooting.columns = shooting.columns.droplevel()
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except ValueError:
            continue
        team_data = team_data[team_data["Comp"] == "Premier League"]
        
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [74]:
len(all_matches)

20

In [75]:
#Combine all individual dataframes into one data

match_df = pd.concat(all_matches)

In [76]:
#Lowercase all of the columns

match_df.columns = [c.lower() for c in match_df.columns]

In [77]:
#Look on the dataframe

match_df

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2020-09-21,20:15,Premier League,Matchweek 2,Mon,Away,W,3,1,Wolves,...,Match Report,,10.0,1.0,16.7,0.0,0.0,0.0,2021,Manchester City
2,2020-09-27,16:30,Premier League,Matchweek 3,Sun,Home,L,2,5,Leicester City,...,Match Report,,4.0,4.0,27.5,0.0,3.0,3.0,2021,Manchester City
4,2020-10-03,17:30,Premier League,Matchweek 4,Sat,Away,D,1,1,Leeds United,...,Match Report,,12.0,7.0,11.6,0.0,0.0,0.0,2021,Manchester City
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1,0,Arsenal,...,Match Report,,11.0,3.0,18.2,2.0,0.0,0.0,2021,Manchester City
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1,1,West Ham,...,Match Report,,6.0,2.0,20.6,0.0,0.0,0.0,2021,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,20.0,10.0,18.7,2.0,0.0,0.0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,21.0,8.0,15.6,1.0,0.0,0.0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,16.0,6.0,15.0,0.0,0.0,0.0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,16.0,5.0,17.1,1.0,0.0,0.0,2021,Sheffield United


In [78]:
#Saving dataframe to matches.csv

match_df.to_csv("matches.csv")