In [1]:
####### THIS SCRIPT WAS AUTHORED BY: Anirudh Ashok
####### Email: anirudh.ash2594@gmail.com
####### Please email me for suggestions or concerns regarding my work.
####### Collaboration is appreciated.
####### 
####### This notebook is to scrape football data from fbref.com
#######




In [2]:
#Importing Date and Time handling:
import datetime
from datetime import date
import time


#Web Scraping and System functions:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import sys, getopt
import csv
import os
import uuid


#Data Visualization:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so prefer the new name and only fall back to the
# legacy one on older matplotlib installs.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = 'seaborn-whitegrid'
plt.style.use(_whitegrid_style)
import missingno as msno


#Progress Bar:
from tqdm import tqdm


#Ignore Warnings:
import warnings
warnings.filterwarnings(action="ignore")

In [3]:
# Folder that receives the scraped CSV files; it is passed as `save_folder` to the
# table-scraping functions called later in this notebook.
# The path is relative, so it resolves against the notebook's working directory.
dir_path = '../../Soccer Analysis/data/fbref'

In [4]:
### Scope of this first phase: Major League Soccer (MLS) only.
## Maps the league name as it appears in FBRef URLs to the FBRef competition id
## (the '22' in https://fbref.com/en/comps/22/...). Both are kept as strings.

league_names_map = {'Major-League-Soccer' : '22'}#, 'Premier-League' : '9'}


### Leagues to search; entries are keys of league_names_map.
league_names_list = ['Major-League-Soccer']


### Seasons required for this study.
### NOTE(review): not referenced by the cells visible here - confirm where it is consumed.
seasons_list = ['2019-2020', '2020-2021', '2021-2022', '2022-2023']



In [5]:
## Column-name lists, one group per FBRef stats category.
#######
####### PLEASE NOTE: This scraping code has been inspired by: https://github.com/parth1902
#######
## Naming convention as it appears in the lists below:
##   <name>   - full list, starting with the six identity columns
##              (player, nationality, position, squad, age, birth_year).
##   <name>2  - the same list without the identity columns; keepersadv2 and shooting2
##              keep "minutes_90s", the other *2 lists drop it as well.
##   <name>3  - no identity columns; stats3 and keepers3 start with "players_used"
##              (presumably squad-level tables - TODO confirm), shooting3 is shooting2
##              without "minutes_90s".
## NOTE(review): in the cells visible here these lists are only passed to
## frame_for_category(), which does not read them.
# standard stats (stats)
stats = ["player","nationality","position","squad","age","birth_year","games","games_starts","minutes","goals","assists","pens_made","pens_att","cards_yellow","cards_red","goals_per90","assists_per90","goals_assists_per90","goals_pens_per90","goals_assists_pens_per90","xg","npxg","xa","xg_per90","xa_per90","xg_xa_per90","npxg_per90","npxg_xa_per90"]
stats3 = ["players_used","possession","games","games_starts","minutes","goals","assists","pens_made","pens_att","cards_yellow","cards_red","goals_per90","assists_per90","goals_assists_per90","goals_pens_per90","goals_assists_pens_per90","xg","npxg","xa","xg_per90","xa_per90","xg_xa_per90","npxg_per90","npxg_xa_per90"] 
# goalkeeping (keepers)
keepers = ["player","nationality","position","squad","age","birth_year","games_gk","games_starts_gk","minutes_gk","goals_against_gk","goals_against_per90_gk","shots_on_target_against","saves","save_pct","wins_gk","draws_gk","losses_gk","clean_sheets","clean_sheets_pct","pens_att_gk","pens_allowed","pens_saved","pens_missed_gk"]
keepers3 = ["players_used","games_gk","games_starts_gk","minutes_gk","goals_against_gk","goals_against_per90_gk","shots_on_target_against","saves","save_pct","wins_gk","draws_gk","losses_gk","clean_sheets","clean_sheets_pct","pens_att_gk","pens_allowed","pens_saved","pens_missed_gk"]
# advanced goalkeeping (keepersadv)
keepersadv = ["player","nationality","position","squad","age","birth_year","minutes_90s","goals_against_gk","pens_allowed","free_kick_goals_against_gk","corner_kick_goals_against_gk","own_goals_against_gk","psxg_gk","psnpxg_per_shot_on_target_against","psxg_net_gk","psxg_net_per90_gk","passes_completed_launched_gk","passes_launched_gk","passes_pct_launched_gk","passes_gk","passes_throws_gk","pct_passes_launched_gk","passes_length_avg_gk","goal_kicks","pct_goal_kicks_launched","goal_kick_length_avg","crosses_gk","crosses_stopped_gk","crosses_stopped_pct_gk","def_actions_outside_pen_area_gk","def_actions_outside_pen_area_per90_gk","avg_distance_def_actions_gk"]
keepersadv2 = ["minutes_90s","goals_against_gk","pens_allowed","free_kick_goals_against_gk","corner_kick_goals_against_gk","own_goals_against_gk","psxg_gk","psnpxg_per_shot_on_target_against","psxg_net_gk","psxg_net_per90_gk","passes_completed_launched_gk","passes_launched_gk","passes_pct_launched_gk","passes_gk","passes_throws_gk","pct_passes_launched_gk","passes_length_avg_gk","goal_kicks","pct_goal_kicks_launched","goal_kick_length_avg","crosses_gk","crosses_stopped_gk","crosses_stopped_pct_gk","def_actions_outside_pen_area_gk","def_actions_outside_pen_area_per90_gk","avg_distance_def_actions_gk"]
# shooting (shooting)
shooting = ["player","nationality","position","squad","age","birth_year","minutes_90s","goals","pens_made","pens_att","shots_total","shots_on_target","shots_free_kicks","shots_on_target_pct","shots_total_per90","shots_on_target_per90","goals_per_shot","goals_per_shot_on_target","xg","npxg","npxg_per_shot","xg_net","npxg_net"]
shooting2 = ["minutes_90s","goals","pens_made","pens_att","shots_total","shots_on_target","shots_free_kicks","shots_on_target_pct","shots_total_per90","shots_on_target_per90","goals_per_shot","goals_per_shot_on_target","xg","npxg","npxg_per_shot","xg_net","npxg_net"]
shooting3 = ["goals","pens_made","pens_att","shots_total","shots_on_target","shots_free_kicks","shots_on_target_pct","shots_total_per90","shots_on_target_per90","goals_per_shot","goals_per_shot_on_target","xg","npxg","npxg_per_shot","xg_net","npxg_net"]
# passing (passing)
passing = ["player","nationality","position","squad","age","birth_year","minutes_90s","passes_completed","passes","passes_pct","passes_total_distance","passes_progressive_distance","passes_completed_short","passes_short","passes_pct_short","passes_completed_medium","passes_medium","passes_pct_medium","passes_completed_long","passes_long","passes_pct_long","assists","xa","xa_net","assisted_shots","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area","progressive_passes"]
passing2 = ["passes_completed","passes","passes_pct","passes_total_distance","passes_progressive_distance","passes_completed_short","passes_short","passes_pct_short","passes_completed_medium","passes_medium","passes_pct_medium","passes_completed_long","passes_long","passes_pct_long","assists","xa","xa_net","assisted_shots","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area","progressive_passes"]
# pass types (passing_types)
passing_types = ["player","nationality","position","squad","age","birth_year","minutes_90s","passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_pressure","passes_switches","crosses","corner_kicks","corner_kicks_in","corner_kicks_out","corner_kicks_straight","passes_ground","passes_low","passes_high","passes_left_foot","passes_right_foot","passes_head","throw_ins","passes_other_body","passes_completed","passes_offsides","passes_oob","passes_intercepted","passes_blocked"]
passing_types2 = ["passes","passes_live","passes_dead","passes_free_kicks","through_balls","passes_pressure","passes_switches","crosses","corner_kicks","corner_kicks_in","corner_kicks_out","corner_kicks_straight","passes_ground","passes_low","passes_high","passes_left_foot","passes_right_foot","passes_head","throw_ins","passes_other_body","passes_completed","passes_offsides","passes_oob","passes_intercepted","passes_blocked"]
# goal and shot creation (gca)
gca = ["player","nationality","position","squad","age","birth_year","minutes_90s","sca","sca_per90","sca_passes_live","sca_passes_dead","sca_dribbles","sca_shots","sca_fouled","gca","gca_per90","gca_passes_live","gca_passes_dead","gca_dribbles","gca_shots","gca_fouled","gca_defense"]
gca2 = ["sca","sca_per90","sca_passes_live","sca_passes_dead","sca_dribbles","sca_shots","sca_fouled","gca","gca_per90","gca_passes_live","gca_passes_dead","gca_dribbles","gca_shots","gca_fouled","gca_defense"]
# defensive actions (defense)
defense = ["player","nationality","position","squad","age","birth_year","minutes_90s","tackles","tackles_won","tackles_def_3rd","tackles_mid_3rd","tackles_att_3rd","dribble_tackles","dribbles_vs","dribble_tackles_pct","dribbled_past","pressures","pressure_regains","pressure_regain_pct","pressures_def_3rd","pressures_mid_3rd","pressures_att_3rd","blocks","blocked_shots","blocked_shots_saves","blocked_passes","interceptions","clearances","errors"]
defense2 = ["tackles","tackles_won","tackles_def_3rd","tackles_mid_3rd","tackles_att_3rd","dribble_tackles","dribbles_vs","dribble_tackles_pct","dribbled_past","pressures","pressure_regains","pressure_regain_pct","pressures_def_3rd","pressures_mid_3rd","pressures_att_3rd","blocks","blocked_shots","blocked_shots_saves","blocked_passes","interceptions","clearances","errors"]
# possession (possession)
possession = ["player","nationality","position","squad","age","birth_year","minutes_90s","touches","touches_def_pen_area","touches_def_3rd","touches_mid_3rd","touches_att_3rd","touches_att_pen_area","touches_live_ball","dribbles_completed","dribbles","dribbles_completed_pct","players_dribbled_past","nutmegs","carries","carry_distance","carry_progressive_distance","progressive_carries","carries_into_final_third","carries_into_penalty_area","pass_targets","passes_received","passes_received_pct","miscontrols","dispossessed"]
possession2 = ["touches","touches_def_pen_area","touches_def_3rd","touches_mid_3rd","touches_att_3rd","touches_att_pen_area","touches_live_ball","dribbles_completed","dribbles","dribbles_completed_pct","players_dribbled_past","nutmegs","carries","carry_distance","carry_progressive_distance","progressive_carries","carries_into_final_third","carries_into_penalty_area","pass_targets","passes_received","passes_received_pct","miscontrols","dispossessed"]
# playing time (playingtime)
playingtime = ["player","nationality","position","squad","age","birth_year","minutes_90s","games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs","minutes_per_sub","unused_subs","points_per_match","on_goals_for","on_goals_against","plus_minus","plus_minus_per90","plus_minus_wowy","on_xg_for","on_xg_against","xg_plus_minus","xg_plus_minus_per90","xg_plus_minus_wowy"]
playingtime2 = ["games","minutes","minutes_per_game","minutes_pct","games_starts","minutes_per_start","games_subs","minutes_per_sub","unused_subs","points_per_match","on_goals_for","on_goals_against","plus_minus","plus_minus_per90","plus_minus_wowy","on_xg_for","on_xg_against","xg_plus_minus","xg_plus_minus_per90","xg_plus_minus_wowy"]
# miscellaneous (misc)
misc = ["player","nationality","position","squad","age","birth_year","minutes_90s","cards_yellow","cards_red","cards_yellow_red","fouls","fouled","offsides","crosses","interceptions","tackles_won","pens_won","pens_conceded","own_goals","ball_recoveries","aerials_won","aerials_lost","aerials_won_pct"]
misc2 = ["cards_yellow","cards_red","cards_yellow_red","fouls","fouled","offsides","crosses","interceptions","tackles_won","pens_won","pens_conceded","own_goals","ball_recoveries","aerials_won","aerials_lost","aerials_won_pct"]

In [6]:
## Function to get urls to be scraped into a list

def get_data_links(top, end):
    """Build the stats-page URLs for one competition, plus their category labels.

    Parameters
    ----------
    top : str
        URL prefix placed before the category segment.
    end : str
        URL suffix placed after the category segment.

    Returns
    -------
    (links, categories) : tuple of two lists of str
        Eight URLs and eight labels in matching order. The first label is
        'standard' rather than the 'stats' segment used in its URL.
    """
    # (URL segment, feature list) for each page, in output order.
    page_specs = (
        ('stats', stats),
        ('shooting', shooting2),
        ('passing', passing2),
        ('passing_types', passing_types2),
        ('gca', gca2),
        ('defense', defense2),
        ('possession', possession2),
        ('misc', misc2),
    )

    links = []
    categories = []
    for segment, feature_names in page_specs:
        page_url, page_label = frame_for_category(segment, top, end, feature_names)
        links.append(page_url)
        categories.append(page_label)

    # The 'stats' page is labelled 'standard' in the returned categories.
    categories[0] = 'standard'

    return links, categories

def frame_for_category(category,top,end,features):
    """Return the page URL for `category` together with the category itself.

    The URL is `top`, `category` and `end` concatenated in that order.
    `features` is accepted but not used.
    """
    return top + category + end, category

print("Click URLs to verify if they are functional")
# Build the MLS (competition id 22) stats-page URLs and their matching category labels.
# NOTE: despite the df_ prefix, df_outfield_url is a plain list of URL strings, not a DataFrame.
df_outfield_url, categories = get_data_links("https://fbref.com/en/comps/22/","/Major-League-Soccer-Stats")
# Bare last expression so the notebook displays both lists.
df_outfield_url, categories

Click URLs to verify if they are functional


(['https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/shooting/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/passing/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/passing_types/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/gca/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/defense/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/possession/Major-League-Soccer-Stats',
  'https://fbref.com/en/comps/22/misc/Major-League-Soccer-Stats'],
 ['standard',
  'shooting',
  'passing',
  'passing_types',
  'gca',
  'defense',
  'possession',
  'misc'])

In [7]:
# URL of the MLS standard-stats page (same value as the first entry of df_outfield_url).
# NOTE(review): no cell visible here reads this module-level `url`; the scraping
# functions use their own loop variable with the same name - confirm it is still needed.
url = "https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats"

In [8]:
### Update 12/14/2023: Player stats tables are commented out in the backend.
## We can see them on the website but we cannot retrieve them as tables.

def get_team_tables(url_list,categories, save_folder):
    """Download every HTML table pandas can read on each page and save it as CSV.

    Parameters
    ----------
    url_list : iterable of str
        Page URLs to fetch; one HTTP GET is issued per URL.
    categories : list
        Not used by this function; kept so existing callers keep working.
    save_folder : str
        Directory that receives the CSV files. It is created if missing.

    Each table is written to ``team_stats_<uuid4>_table_<i>.csv`` inside
    `save_folder`. A page on which no table can be parsed is reported as
    having 0 tables and skipped. The function sleeps 5 seconds after every
    page and returns None.
    """
    os.makedirs(save_folder, exist_ok=True)

    # Fetch the HTML content of the webpages
    for url in url_list:
        print(f"Accessing URL: {url}")
        response = requests.get(url)
        html_content = response.content

        try:
            # Parse the HTML and extract tables
            tables = pd.read_html(html_content)
        except ValueError:
            # read_html raises ValueError when the page holds no parsable table.
            # Use an empty list so the count below never reads an unbound or
            # stale `tables` from a previous page.
            tables = []
        print(f"Number of tables found: {len(tables)}")

        for i, table in enumerate(tables):
            print(f"\nTable {i+1}:\n")
            print(table.head(5))

            # A random UUID keeps file names unique across pages and re-runs.
            file_path = os.path.join(save_folder, f"team_stats_{uuid.uuid4()}_table_{i}.csv")

            table.to_csv(file_path, index=False)
            print(f"Table {i+1} is saved as {file_path}")

        print("\n" + "-"*192 + "\n")

        time.sleep(5) ## Delay between requests to avoid overwhelming the server

In [9]:
# Save every table pandas can read directly from the stats pages as CSV files under dir_path.
get_team_tables(df_outfield_url,categories, dir_path)

Accessing URL: https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats
Number of tables found: 2

Table 1:

  Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0 Unnamed: 3_level_0  \
               Squad               # Pl                Age               Poss   
0        Atlanta Utd                 33               25.8               55.6   
1             Austin                 26               27.9               51.2   
2        CF Montréal                 31               24.4               48.3   
3          Charlotte                 32               27.2               52.7   
4       Chicago Fire                 26               26.5               46.1   

  Playing Time                    Performance      ... Per 90 Minutes        \
            MP Starts   Min   90s         Gls Ast  ...            Gls   Ast   
0           34    374  3060  34.0          64  44  ...           1.88  1.29   
1           34    374  3060  34.0          48  37  ...           1.41  1.09   
2 

In [10]:
def get_player_tables(url_list, categories, save_folder):
    """Extract the player tables from each page and save them as CSV.

    The tables are looked up by the id ``stats_<category>`` after the HTML
    comment markers have been stripped from the page, because the player
    tables are shipped inside HTML comments. Reference:
    https://stackoverflow.com/questions/76911108/web-scraping-a-table-into-a-pandas-dataframe-from-fbref-com

    Parameters
    ----------
    url_list : iterable of str
        Page URLs to fetch; one HTTP GET is issued per URL.
    categories : iterable of str
        Category labels; every label is tried against every page.
    save_folder : str
        Directory that receives the CSV files. It is created if missing.

    Each table found is written to ``player_stats_<uuid4>.csv`` inside
    `save_folder`. The function sleeps 5 seconds after every page and
    returns None.
    """
    # Create the output folder up front so to_csv cannot fail on a missing directory.
    os.makedirs(save_folder, exist_ok=True)

    # Fetch the HTML content of the webpages
    for url in url_list:
        print(f"Accessing URL: {url}")
        response = requests.get(url)

        # Strip the comment markers once per page instead of once per category.
        uncommented_html = response.text.replace('<!--','').replace('-->','')

        tables_saved = 0
        for category in categories:
            table_id = f'stats_{category}'
            try:
                player_table = pd.read_html(uncommented_html, attrs={'id': table_id})[0]
            except ValueError:
                # No table with this id on the page; try the next category.
                continue

            print(player_table)

            file_path = os.path.join(save_folder, f"player_stats_{uuid.uuid4()}.csv" )
            player_table.to_csv(file_path, index=False)
            tables_saved += 1

        if tables_saved == 0:
            # Make an empty result visible instead of skipping the page silently.
            print(f"No player table found at: {url}")

        print("\n" + "-"*192 + "\n")

        time.sleep(5) ## Delay between requests to avoid overwhelming the server

In [11]:
# Save the player tables found on the stats pages as CSV files under dir_path.
get_player_tables(df_outfield_url, categories, dir_path)

Accessing URL: https://fbref.com/en/comps/22/stats/Major-League-Soccer-Stats
    Unnamed: 0_level_0 Unnamed: 1_level_0 Unnamed: 2_level_0  \
                    Rk             Player             Nation   
0                    1         Luis Abram             pe PER   
1                    2     Lalas Abubakar             gh GHA   
2                    3      Daniel Aceves             mx MEX   
3                    4       Bryan Acosta             hn HON   
4                    5       Bryan Acosta             hn HON   
..                 ...                ...                ...   
884                851        Ethan Zubak             us USA   
885                852      Dario Župarić             ba BIH   
886                853        Graham Zusi             us USA   
887                854    Nökkvi Þórisson             is ISL   
888                855  Róbert Þorkelsson             is ISL   

    Unnamed: 3_level_0 Unnamed: 4_level_0 Unnamed: 5_level_0  \
                   Pos    

In [12]:
### THIS SCRIPT SCRAPES THE MOST RECENT LEAGUE TABLES ###