### Web Scraping 6 seasons of La Liga match history (fbref.com)
#### 2022/2023
#### 2021/2022
#### 2020/2021
#### 2019/2020
#### 2018/2019
#### 2017/2018

In [6]:
import requests 
import html5lib
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


In [14]:
years = list(range(2023, 2017, -1))

years

[2023, 2022, 2021, 2020, 2019, 2018]

In [8]:
all_matches = []     #list of data frames
standings_url = "https://fbref.com/en/comps/12/La-Liga-Stats"

import time

for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]     #grab the league table. match data in here

    links = [l.get("href") for l in standings_table.find_all('a')]      #grab the match links
    links = [l for l in links if '/squads/' in l]    #filter match links to grab only those with squads
    team_urls = [f"https://fbref.com{l}" for l in links] #forming absolute path

    previous_season = soup.select("a.prev")[0].get('href')  #grab anchor tags with class prev and get the href property of the anchor tag
    standings_url = f"https://fbref.com/{previous_season}"   

    #looping through each team url and scrap the match logs for each team.
    # make sure to set the team name and year properly
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ") #getting team name

        data = requests.get(team_url) #grabbing scores and fixtures table
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        #pull all comps shooting list
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(data.text, match="Shooting")[0]
        shooting.columns = shooting.columns.droplevel()
        
        #when the shooting stats is empty for a team. for some reason some teams don't have shooting stats.
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date" )
        except ValueError:
            continue

        team_data = team_data[team_data["Comp"] == "La Liga"] #filter for la liga only.
        #on the site we can tell which season and which team we're viewing so let's preserve that info in our scraping data
        team_data["Season"]  = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        #we don't want to scrape quickly as we're scraping
        time.sleep(10)   


In [9]:
match_df  = pd.concat(all_matches)

#make all column names lowercase for easy tpying
match_df.columns = [c.lower() for c in match_df.columns]

#write our df to csv file
match_df.to_csv("matches_two.csv")

In [11]:
matches = pd.read_csv('matches_two.csv')
matches.head()

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,0,2022-08-13,21:00,La Liga,Matchweek 1,Sat,Home,D,0.0,0.0,...,Match Report,,21.0,5.0,17.0,1.0,0.0,0.0,2023,Barcelona
1,1,2022-08-21,22:00,La Liga,Matchweek 2,Sun,Away,W,4.0,1.0,...,Match Report,,15.0,7.0,14.6,0.0,0.0,0.0,2023,Barcelona
2,2,2022-08-28,19:30,La Liga,Matchweek 3,Sun,Home,W,4.0,0.0,...,Match Report,,24.0,9.0,14.4,1.0,0.0,0.0,2023,Barcelona
3,3,2022-09-03,21:00,La Liga,Matchweek 4,Sat,Away,W,3.0,0.0,...,Match Report,,18.0,5.0,16.0,2.0,0.0,0.0,2023,Barcelona
4,5,2022-09-10,18:30,La Liga,Matchweek 5,Sat,Away,W,4.0,0.0,...,Match Report,,16.0,8.0,14.9,0.0,0.0,0.0,2023,Barcelona


In [12]:
matches.shape

(4322, 28)

###  Feature Labels
##### <br>date - > Date match was played <br>
##### time -> Time match was played <br>
##### comp -> name of competition (teams compete for different trophies during the season)<br>
##### round -> match week. label for specific stages in the competition. eg match week one is when each team plays their first game, match week two is second game and so on <br>
##### day -> the day of the week the match is played <br>
##### venue -> did the team host the visitors(HOME) or they were visitors(AWAY) <br>
##### result -> outcome of the match. was it a Win(W), Draw(D) or Loss(L)<br>
##### gf -> Goals for. How many goals did the the team score<br>
##### ga -> Goals Against. How many goals were scored against the team <br>
##### oppenent -> the team they competed with <br>
##### xg - > expected goals(includes penalty kicks but excludes penalty shootouts). it's a metric for the probability of a shot resulting in a goal kind of like goals a tem is expected to have scored based on the quality and quantity of shots. since soccer is a low sco <br>
##### xgA -> expected goals allowed(includes penalty kicks but do not include penalty shootouts). The number of goals a team is expected to have conceded based on the quality and quantity of shots they have taken . <br>
##### poss -> Possession. an indicator of who encountered the ball more. calculated as a percentage of passes attempted <br>
##### attendance -> how many spectators were present at the game. <br>
##### captain -> team captain <br>
##### formation -> Number of players from defenders to forwards not including goal keeper. eg 4-3-3 means 4 defenders, 3 midfielders and 3 forwards. <br>
##### feferee -> name of referee that handled the game. <br>
##### match report -> link to a visual representation of the same statistics above. we ignored it in our Dataframe <br>
##### notes -> any other descriptive statistics relating to the match eg. was extra time required. most rows are empty for this. for the purpose of our work, this is not required. <br>
##### sh -> total shots made during the game. doesn't include penalty kicks. <br>
##### sot -> Shots on target ie of the total shots made, how many were directed within the goal posts. <br>
##### dist - > Average shot distance in yards from goal of all shots taken. Doesn't include penalty kicks. <br>
##### fk -> Shots from free kicks. <br>
##### pk -> Penalty kicks made or converted. <br>
##### pkatt -> Penalty kicks attempted.<br>
##### season -> indicator of which season the match was played. We should have 2018- 2023. <br>
##### team -> indicates the team the statistics belongs to. <br>