# Collecting Player Dataset
In this notebook, we will:
1. Scrape player data from fbref.com
2. Map the Euro Fantasy player list to the FBRef player list

## 1. Scraping player table from FBRef using BeautifulSoup

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import os
from pathlib import Path

# silence warnings for the rest of the session to keep cell output clean
# NOTE: with no category argument this ignores ALL Python warnings, not only those raised by pandas
import warnings
warnings.filterwarnings('ignore')

In [32]:
def extract_player_data_from_p(player_tag):
    """
    Extract one player's record from a <p> tag of an FBRef player index page.

    The tag is expected to hold an <a> link to the player, optionally one or
    more <span> elements (the first one carrying the country code), and then
    free text with up to three fields separated by a non-breaking space, a
    middle dot and a space.

    Parameters
    ----------
    player_tag : bs4.element.Tag
        The <p> element describing a single player.

    Returns
    -------
    tuple
        Always 7 items: (name, country_code, url, active, info_1, info_2, info_3).
        `country_code` is None when the tag has no <span>. The three info
        fields are taken positionally from the trailing text; fields that are
        absent are None, and any separators beyond the second one are kept
        inside the last field so the tuple length never exceeds 7.

    Raises
    ------
    AttributeError
        If the tag contains no <a> element (``find`` then returns None).
    """
    separator = '\xa0· '  # NBSP + middle dot + space delimits the info fields

    # the player name is the text of the <a> element; its href is the profile URL
    link = player_tag.find('a')
    player_name = link.text
    player_url = link['href']

    player_active = bool(link.find_all('strong'))  # player is active if name is bold

    country_span = player_tag.find('span')  # try to get country in span element
    if country_span:
        player_country = country_span.text.upper()

        # the info text is the node that directly follows the LAST span;
        # it may be missing entirely, in which case there is no info to parse
        info_node = player_tag.find_all("span")[-1].next_sibling
        info_text = info_node.get_text().lstrip(' \xa0·') if info_node is not None else ''
    else:
        # no span element found, which means no country code
        player_country = None
        # the info text is whatever directly follows the player link (may be absent)
        info_node = link.next_sibling
        info_text = info_node.get_text().strip() if info_node is not None else ''

    # maxsplit=2 caps the result at three fields so the return shape is fixed;
    # empty text means "no info" rather than a single empty-string field
    player_info = info_text.split(separator, 2) if info_text else []
    player_info.extend([None] * (3 - len(player_info)))  # pad missing fields with None

    return player_name, player_country, player_url, player_active, *player_info

In [36]:
def url_builder_player_name(keyword):
    """Build the FBRef player-index URL for a two-character name prefix.

    Parameters
    ----------
    keyword : str
        Exactly two characters; inserted verbatim into the URL path.

    Returns
    -------
    str
        URL of the form ``https://fbref.com/en/players/<keyword>/``.

    Raises
    ------
    ValueError
        If `keyword` is not exactly two characters long.
    """
    # guard clause inverted: the happy path returns first, anything else is rejected
    if len(keyword) == 2:
        base = "https://fbref.com/en"
        return f"{base}/players/{keyword}/"

    raise ValueError("Keyword must be 2 characters long")

def get_player_by_part_of_name(keyword: str, timeout: float = 10.0):
    """
    Get player data by first two letters of their name (usually last name).

    Downloads the FBRef player index page for `keyword`, locates the main
    ``section_content`` div and turns every <p> inside it into one row.

    Parameters
    ----------
    keyword : str
        Two-character prefix identifying the index page.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 10).

    Returns
    -------
    pandas.DataFrame
        One row per <p> tag, with columns 'Name', 'country_code_alpha2', 'url',
        'Active', 'Years Played', 'Position', 'Clubs Played'.

    Raises
    ------
    ValueError
        If `keyword` is not two characters long, or if the page does not
        contain the expected player listing.
    requests.HTTPError
        If the server answers with an error status (e.g. when rate-limited).
    """
    url = url_builder_player_name(keyword)

    # a timeout prevents the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=timeout)
    # fail loudly on 4xx/5xx instead of trying to parse an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")  # parse the html using BeautifulSoup
    # main content div: class "section_content" and an id that starts with "div_"
    content = soup.find("div", class_="section_content", id=re.compile(r"^div_"))
    if content is None:
        raise ValueError(f"No player listing found at {url}")

    info_lst = [extract_player_data_from_p(row) for row in content.find_all("p")]  # extract player data from p tags

    COLNAMES = ['Name', 'country_code_alpha2', 'url', 'Active', 'Years Played', 'Position', 'Clubs Played']
    df = pd.DataFrame(info_lst, columns=COLNAMES)
    return df

Example usage

In [53]:
# scrape the FBRef player index for the two-letter prefix "mb" and preview the first rows
df = get_player_by_part_of_name("mb")
df.head()

Unnamed: 0,Name,country_code_alpha2,url,Active,Years Played,Position,Clubs Played
0,Joshua M'Bahia,CI,/en/players/f02067df/Joshua-MBahia,True,2023-2024,DF,Clermont Foot
1,Salem M'bakata,CD,/en/players/0c75df2a/Salem-Mbakata,True,2018-2024,DF,"Gaziantep FK, Sochaux, Aris"
2,Tatenda M'balaka,MW,/en/players/f64c1f4b/Tatenda-Mbalaka,True,2023-2026,DF,Malawi
3,Modeste M'bami,CM,/en/players/82ec2963/Modeste-Mbami,False,2000-2016,MF,"Marseille, Paris Saint-Germain, Sedan, Almería..."
4,Bryan M'Bango,,/en/players/db91d5d7/Bryan-MBango,False,2009-2010,,


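Diacritics (accents) in some player names make exact string matching unreliable. For example, Kylian Mbappé is stored with an accented "é", so a plain search for "Mbappe" finds nothing; a regex with a wildcard in place of the accented letter does.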

In [52]:
# get mbappe's data based on regex: each "." matches any single character, so the
# final "." also matches the accented "é"; case=False makes the match case-insensitive
df[df['Name'].str.contains('K.+ Mbapp.', case=False, regex=True)]

Unnamed: 0,Name,country_code_alpha2,url,Active,Years Played,Position,Clubs Played
88,Kylian Mbappé,FR,/en/players/42fd9c7f/Kylian-Mbappe,True,2015-2024,"FW,MF","Paris Saint-Germain, France, Monaco"
