From UEFA's website terms and conditions:

6.2 All Content provided by UEFA is owned by, or licensed to, UEFA. Content provided by UEFA is provided to you "AS IS" and may not be used, reproduced, distributed, transmitted, broadcast, displayed, sold, licensed or otherwise exploited for any other purposes than your personal access and viewing of the Content on the UEFA Platforms. For that sole and exclusive purpose, UEFA grants to you a limited, revocable, non-exclusive license to access and use the UEFA Platforms privately for non-commercial purposes, in accordance with these Terms & Conditions.

Author: Carolina Cornejo Castellano

Requirements: Install selenium, webdriver-manager, pandas and numpy. Update Google Chrome to the latest version.


This scraper takes approximately 6 hours to run.

In [5]:
# 1: import libraries -----------------------------------------------------
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import pandas as pd
import numpy as np
import re
# Chrome launch options handed to webdriver.Chrome further down:
# ask Chrome to open with a maximized window.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--start-maximized")

In [6]:
# 2: enter website, reject cookies and define function to scroll ----------
# Launch Chrome using the driver path returned by ChromeDriverManager().install()
# and the launch options configured in chrome_options.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
# Open the player statistics page that the scraping loop starts from.
driver.get("https://www.uefa.com/european-qualifiers/statistics/players/")
# Fixed pause before looking for the cookie banner (presumably so it has time to render).
time.sleep(2)
# NOTE(review): find_element raises if the reject button is not on the page;
# confirm the cookie banner is always shown on a fresh session.
reject_cookies = driver.find_element(By.CSS_SELECTOR, '#onetrust-reject-all-handler')
reject_cookies.click()

# define function to scroll down, in order to make rest of the table discoverable
def scroll_to_sponsors():
    """Scroll down to the sponsors banner to make the rest of the table discoverable.

    Uses the module-level ``driver``. Two XPath locators for the banner are
    tried in order; the first one that can be found and scrolled to wins. If
    neither works a message is printed and the function returns normally
    (best effort). In every case the function pauses for two seconds before
    returning.

    Only ``Exception`` subclasses are treated as "locator did not work", so
    Ctrl+C (``KeyboardInterrupt``) still stops a long scraping run.

    Returns:
        None.
    """
    banner_xpaths = (
        "//div[contains(text(), 'Official global sponsors')]",
        '//div[@class="pk-container pk-bg--background lazyloaded" and @role="region" and @pk-theme="light" and @aria-label=""]',
    )
    for banner_xpath in banner_xpaths:
        try:
            sponsors_banner = driver.find_element(By.XPATH, banner_xpath)
            ActionChains(driver).scroll_to_element(sponsors_banner).perform()
        except Exception:
            # This locator failed; fall through to the next candidate.
            continue
        break
    else:
        # No locator succeeded.
        print("The scroll_to_sponsors function didn't work.")
    # Same fixed pause the page gets after a successful or failed scroll.
    time.sleep(2)
    

# Module-level state used later on: the accumulated results table and
# empty placeholders for the per-player values.
dataset = pd.DataFrame()
name = []
national_team = []
club = []
overview_figures = []
overview_labels = []
stats_figures = []
stats_labels = []

In [7]:
# 3: define main functions ------------------------------------------------
def extract_name_and_teams():
    """Read the player's name, national team and club from the page open in ``driver``.

    Uses the module-level ``driver``. The three fields are looked up
    independently; when a lookup fails a message is printed and that field
    is returned as an empty string, so one missing element never prevents
    the others from being returned. The club is tried with a CSS selector
    first and an absolute XPath second.

    Returns:
        tuple: ``(name, national_team, club)`` as strings, where ``name`` is
        the first and last name joined by a single space.
    """
    # Defaults guarantee every name is bound at the return statement even
    # when its lookup fails.
    name = national_team = club = ""

    try:
        first_name = driver.find_element(By.CSS_SELECTOR, '.player-header__name--first').text
        last_name = driver.find_element(By.CSS_SELECTOR, '.player-header__name--last').text
        name = first_name + ' ' + last_name
        print(name)
    except Exception:
        print('Player name and/or last name not found.')

    try:
        national_team = driver.find_element(By.XPATH, '//span[@class="player-header__team-name pk-text--text-01"][1]').text
        print(national_team)
    except Exception:
        print('National team not found.')

    try:
        club = driver.find_element(By.CSS_SELECTOR, '.player-header__teams > div:nth-child(2) > a:nth-child(3) > pk-identifier:nth-child(1) > div:nth-child(2) > span:nth-child(1)').text
        print(club)
    except Exception:
        print('Club not found. Trying another xpath...')
        try:
            club = driver.find_element(By.XPATH, '/html/body/div[3]/div/div/div[2]/div[3]/div[2]/div[2]/pk-identifier/div/div[1]/div[2]/pk-identifier/div/span').text
            print(club)
        except Exception:
            print('Club not found.')
            club = ""

    return name, national_team, club

def extract_list_with_xpath(labels_xpath):
    """Return the text of every element matching ``labels_xpath`` on the current page.

    Uses the module-level ``driver``. If the page contains an ``h2`` whose
    text includes 'Qualifying stats', a message is printed and an empty list
    is returned without evaluating ``labels_xpath``.

    Args:
        labels_xpath: XPath expression selecting the elements to read.

    Returns:
        list: the ``.text`` of each matched element, in the order returned
        by the driver, or ``[]`` for a qualifying-only page.
    """
    qualifying_headings = driver.find_elements(By.XPATH, "//h2[contains(text(), 'Qualifying stats')]")
    if qualifying_headings:
        print('Player only participated in qualifyings.')
        return []

    matched_elements = driver.find_elements(By.XPATH, labels_xpath)
    return [element.text for element in matched_elements]

def open_accordions():
    """Click the title of the accordion items with ids ``accordion-item-0`` to ``accordion-item-5``.

    Uses the module-level ``driver``. After each successful click the
    function pauses for two seconds. An item that cannot be found or clicked
    is reported with a printed message and skipped; the remaining indices
    are still tried.

    Only ``Exception`` subclasses are treated as "item not available", so
    Ctrl+C (``KeyboardInterrupt``) is no longer swallowed.

    Returns:
        None.
    """
    for index in range(6):
        title_selector = '#accordion-item-' + str(index) + ' > pk-accordion-item-title:nth-child(1) > h2:nth-child(1)'
        try:
            driver.find_element(By.CSS_SELECTOR, title_selector).click()
        except Exception:
            print('There are no more accordions to open.')
            continue
        # Pause after a successful click before moving to the next item.
        time.sleep(2)
            
def update_dataset(dataset):
    """Append the player held in the module-level variables as a new row of ``dataset``.

    The row is built from the globals ``name``, ``national_team``, ``club``,
    ``overview_labels`` / ``overview_figures`` and ``stats_labels`` /
    ``stats_figures``. Labels become column names and figures become values.

    Behaviour:
      * If there are fewer stats figures than stats labels, the missing
        figures are padded with empty strings.
      * Repeated labels are collapsed to their first occurrence.
      * The result has the union of the existing columns and the new row's
        columns (existing columns first, new ones appended in label order).
        Cells that have no value on either side are filled with "".
        No column is ever dropped.

    Args:
        dataset: DataFrame with the players collected so far (may be empty).

    Returns:
        pandas.DataFrame: a new frame containing the previous rows followed
        by the new row, with a fresh 0..n-1 index. ``dataset`` itself is not
        modified.

    Raises:
        ValueError: raised by pandas when the number of values still does
        not match the number of labels (e.g. more figures than labels).
    """
    columns = ["name", "national_team", "club"] + overview_labels + stats_labels
    # Pad against the number of stats *labels*; padding by len(stats_figures)
    # would add nothing exactly when figures are missing.
    missing_stats = max(len(stats_labels) - len(stats_figures), 0)
    player_info = (
        [name, national_team, club]
        + overview_figures
        + stats_figures
        + [""] * missing_stats
    )

    new_row = pd.DataFrame([player_info], columns=columns)
    # Keep only the first occurrence of any repeated label.
    new_row = new_row.loc[:, ~new_row.columns.duplicated()].copy()
    dataset = dataset.loc[:, ~dataset.columns.duplicated()].copy()

    # Union of columns: existing order first, then labels seen for the first time.
    known_columns = set(dataset.columns)
    all_columns = list(dataset.columns) + [
        column for column in new_row.columns if column not in known_columns
    ]

    new_row = new_row.reindex(columns=all_columns, fill_value="")
    if len(dataset.index) == 0:
        # Nothing collected yet: the new row is the whole dataset.
        return new_row.reset_index(drop=True)

    dataset = dataset.reindex(columns=all_columns, fill_value="")
    return pd.concat([dataset, new_row], ignore_index=True)

In [8]:
# 4: scraping work: iterate over each country and each player -------------
# For every country option of the filter (radio options 2..54 of the second
# accordion): select the main tournament, select the country, collect the
# player IDs linked from the table, then visit each player's profile and
# statistics pages and append one row per player to `dataset`.
# NOTE(review): the start index 2 presumably skips a non-country first option - confirm.
for i in range(2, 55):
    # open main site
    driver.get("https://www.uefa.com/european-qualifiers/statistics/players/")
    # select main tournament (excluding qualifying)
    time.sleep(2)
    try:
        main_tournament = driver.find_element(By.XPATH, '//pk-accordion-item[1]/pk-accordion-item-content/pk-radio/pk-radio-option[1]')
        time.sleep(2)
        main_tournament.click()
        time.sleep(2)
    except Exception:
        # Fallback locator for the same radio button; Exception (not a bare
        # except) so Ctrl+C still interrupts the run.
        main_tournament = driver.find_element(By.XPATH, '//input[@class="pk-radio" and @name="phase" and @title="phase" and @type="radio" and @id="tournament" and @value="TOURNAMENT" and @part="input"]')
        time.sleep(2)
        main_tournament.click()
        time.sleep(2)
    # select country
    xpath_country = '//pk-accordion-item[2]/pk-accordion-item-content/div/pk-radio/pk-radio-option[' + str(i) + ']/span'
    select_country = driver.find_element(By.XPATH, xpath_country)
    country_name = select_country.text
    print('---------- Accessing info of ' + country_name + ' ----------')
    time.sleep(2)
    select_country.click() # select country
    time.sleep(3)
    # scroll down twice, as needed to discover the whole page
    scroll_to_sponsors()
    scroll_to_sponsors()
    # gather all players' stats website links
    player_xpath = '//a[contains(@class, "pk-w--100") and contains(@href, "/api/v1/linkrules/player/") and contains(@href, "/statistics?competitionId=3&phase=TOURNAMENT")]'
    try:
        select_player = driver.find_elements(By.XPATH, player_xpath)
    except Exception:
        select_player = []
    if not select_player:
        # find_elements returns an empty list when nothing matches, so the
        # empty case has to be checked explicitly rather than via an exception.
        print('Seems like the site of ' + country_name + ' is empty.')
        continue
    players_ids = []
    for link in select_player:
        href = link.get_attribute('href')
        id_match = re.search(r'player/(\d+)/', href or '')
        if id_match is None:
            # Skip a link without a numeric ID instead of crashing the run.
            print('Could not read a player ID from link: ' + str(href))
            continue
        players_ids.append(id_match.group(1))
    for player_id in players_ids:
        time.sleep(2)
        # profile page: name, teams and overview figures
        driver.get('https://www.uefa.com/euro2024/teams/players/' + player_id + '/')
        print('---------- Working on player whose ID is ' + player_id + ' -------------')
        time.sleep(2)
        # These module-level names are read by update_dataset() below.
        name, national_team, club = extract_name_and_teams()
        overview_labels = extract_list_with_xpath('//span[@class="player-profile-category"]')
        overview_figures = extract_list_with_xpath('//span[@class="player-profile-value"]')
        print(overview_labels)
        print(overview_figures)
        time.sleep(2)
        # statistics page: open every accordion before reading labels and values
        driver.get('https://www.uefa.com/euro2024/teams/players/' + player_id + '/statistics/')
        time.sleep(2)
        scroll_to_sponsors()
        open_accordions()
        stats_labels = extract_list_with_xpath('//div[@slot="stat-label"]')
        stats_figures = extract_list_with_xpath('//div[@slot="stat-value"]')
        print(stats_labels)
        print(stats_figures)
        dataset = update_dataset(dataset)
        print('---------- End of process for player whose ID is ' + player_id + ' ----------')
        time.sleep(2)

---------- Accessing info of Albania ----------
---------- Working on player whose ID is 250127042 -------------
MARIO MITAJ
Albania
Lokomotiv Moskva
['POSITION', 'CLUB NUMBER', 'NATIONAL TEAM NUMBER', 'COUNTRY OF BIRTH', 'DATE OF BIRTH']
['Defender', '40', '3', 'Greece', '06/8/2003 (20)']
There are no more accordions to open.
There are no more accordions to open.
['Matches played', 'Minutes played', 'Goals', 'Tackles', 'Balls recovered', 'Passing accuracy (%)', 'Top speed (km/h)', 'Distance covered (km)', 'Yellow cards', 'Red cards', 'Balls recovered', 'Blocks', 'Own goals conceded', 'Penalties conceded', 'Clearances completed', 'Attempts conceded on target', 'Passes completed', 'Short passes completed', 'Medium passes completed', 'Long passes completed', 'Backward passes completed', 'Passes completed to left', 'Passes completed to right', 'Crossing accuracy (%)', 'Crosses completed', 'Free-kicks taken', 'Times in possession', 'Passes into attacking third', 'Passes into key play area'