In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException

In [2]:
# Listing page that every later cell scrapes player data from
url = "https://www.footballtransfers.com/us/players/uk-premier-league"

In [3]:
# Start a Chrome session, using the driver binary that
# ChromeDriverManager().install() resolves, then open the listing page
driver = webdriver.Chrome(
    service=ChromeService(
        ChromeDriverManager().install()
    )
)
driver.get(url)

In [4]:
# Locate the <tbody> whose id contains 'player-table-body'; the single-player
# lookups in the next cells search inside this element
players = driver.find_element(
    By.XPATH,
    ".//tbody[contains(@id, 'player-table-body')]",
)

In [5]:
# Find information for one player

In [6]:
# Player's name: first text line of the first 'text' div inside the table body
player_name = (
    players
    .find_element(By.XPATH, ".//div[contains(@class, 'text')]")
    .text
    .split('\n')[0]
)

In [7]:
# Player's position: second text line of the same 'text' div that holds the name
player_position = (
    players
    .find_element(By.XPATH, ".//div[contains(@class, 'text')]")
    .text
    .split('\n')[1]
)

In [8]:
# Player's skill rating: first text line of the first 'table-skill' div
player_skill = (
    players
    .find_element(By.XPATH, ".//div[contains(@class, 'table-skill')]")
    .text
    .split('\n')[0]
)

In [9]:
# Player's "pot": second text line of the same 'table-skill' div
# (presumably the potential rating -- confirm against the site)
player_pot = (
    players
    .find_element(By.XPATH, ".//div[contains(@class, 'table-skill')]")
    .text
    .split('\n')[1]
)

In [10]:
# Player's estimated transfer value: text of the first 'text-center' <td>,
# minus its first character (presumably a currency symbol -- confirm)
player_etv = (
    players
    .find_element(By.XPATH, ".//td[contains(@class, 'text-center')]")
    .text[1:]
)

In [11]:
# Container for all players: the <tbody> whose id contains 'player-table-body'
container = driver.find_element(
    By.XPATH,
    ".//tbody[contains(@id, 'player-table-body')]",
)

In [12]:
# Find all players on the page: one <tr> per player inside the table body.
# (Selecting the <tbody> itself would return the table body rather than
# the individual player rows.)
all_players = driver.find_elements(
    By.XPATH,
    ".//tbody[contains(@id, 'player-table-body')]//tr",
)

In [13]:
# Loop through the result pages to collect the information of the first 250 players

In [14]:
# Scrape name, position, skill, pot and ETV for every row of the player table,
# page by page, into five parallel lists (one entry per player in each list).

MAX_PAGES = 10          # go through the first 10 pages of players
MAX_STALE_RETRIES = 5   # consecutive retries allowed when a page re-renders mid-scrape
PAGE_LOAD_TIMEOUT = 10  # seconds to wait for the table to refresh after clicking "Next"

TABLE_BODY_XPATH = ".//tbody[contains(@id, 'player-table-body')]"


def _scrape_row(row):
    """Return ``(name, position, skill, pot, etv)`` as strings for one table row.

    Raises whatever Selenium raises when a cell is missing or stale, and
    IndexError when a cell has fewer text lines than expected.
    """
    # Name and position are two lines of the same cell; read that cell once.
    text_lines = row.find_element(By.XPATH, ".//div[contains(@class, 'text')]").text.split('\n')
    # Skill and pot are likewise two lines of one cell.
    skill_lines = row.find_element(By.XPATH, ".//div[contains(@class, 'table-skill')]").text.split('\n')
    # Drop the first character of the value (presumably a currency symbol -- confirm).
    etv = row.find_element(By.XPATH, ".//td[contains(@class, 'text-center')]").text[1:]
    return text_lines[0], text_lines[1], skill_lines[0], skill_lines[1], etv


# Initiate empty lists where we'll put the player information
names = []
positions = []
skills = []
pots = []
ETVs = []

# Start at page 1
current_page = 1
stale_retries = 0

while current_page <= MAX_PAGES:
    try:
        container = driver.find_element(By.XPATH, TABLE_BODY_XPATH)
        all_players = container.find_elements(By.TAG_NAME, "tr")
        # Scrape into a page-local buffer first: if an element goes stale
        # part-way through, nothing from this page has reached the result
        # lists yet, so retrying the page cannot create duplicate rows.
        page_rows = [_scrape_row(player) for player in all_players]
    except StaleElementReferenceException:
        # The table re-rendered while we were reading it; retry the same page,
        # but give up instead of spinning forever if it never settles.
        stale_retries += 1
        if stale_retries > MAX_STALE_RETRIES:
            raise
        continue
    stale_retries = 0

    for name, position, skill, pot, etv in page_rows:
        names.append(name)
        positions.append(position)
        skills.append(skill)
        pots.append(pot)
        ETVs.append(etv)

    current_page += 1
    if current_page > MAX_PAGES or not all_players:
        # Last wanted page is done (no need to click "Next" again), or the
        # table was empty so there is no row whose replacement we could wait for.
        break

    try:
        next_button = driver.find_element(By.XPATH, ".//button[contains(@class, 'pagination_next_button')]")
        next_button.click()  # Go to next page

        # Wait until the rows just scraped are detached from the DOM and the
        # replacement rows are present, so the next pass reads the new page.
        wait = WebDriverWait(driver, PAGE_LOAD_TIMEOUT)
        wait.until(EC.staleness_of(all_players[0]))
        wait.until(lambda d: d.find_elements(By.XPATH, TABLE_BODY_XPATH + "//tr"))
    except WebDriverException as exc:
        # Best effort, as before: a missing/unclickable "Next" button or a table
        # that never refreshes ends pagination, keeping what was collected.
        # Unlike a bare ``except``, this lets KeyboardInterrupt through.
        print(f"Stopped after page {current_page - 1}: {type(exc).__name__}")
        break

In [15]:
# Assemble the five parallel lists into one table, one column per attribute
df = pd.DataFrame(
    dict(
        name=names,
        position=positions,
        skill=skills,
        pot=pots,
        etv=ETVs,
    )
)

In [16]:
# Write the data frame to 'epl_players.csv' (pandas' default options, so the
# row index is written as the first column)
df.to_csv(path_or_buf='epl_players.csv')

In [17]:
# Double check to make sure there is no missing information.
# Every scraped value is a string, so a blank cell shows up as '' rather than
# NaN; count both, otherwise this check could never report a gap.
(df.isna() | df.eq('')).sum()

name        0
position    0
skill       0
pot         0
etv         0
dtype: int64