# Web Scraping Transfermarkt for Premier League Players

## Importing Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random

## Setup

In [2]:
def get_soup_with_selenium(url):
    """Load ``url`` in headless Chrome and return the page source as BeautifulSoup.

    A new Chrome driver is started on every call and is always quit before
    returning. If fetching the page raises, the error is printed and an empty
    document is parsed instead, so the caller always receives a soup object.

    Args:
        url: Address of the page to load.

    Returns:
        BeautifulSoup: The page parsed with ``html.parser`` (empty on failure).
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        user_agent_flag,
    ):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)

    # Stays empty unless the page source is read successfully.
    page_html = ""
    try:
        browser.get(url)
        # Fixed 5 s pause before reading the source (presumably to let the
        # page finish rendering; no explicit readiness check is made).
        time.sleep(5)
        page_html = browser.page_source
    except Exception as e:
        print(f"⚠️ Failed to fetch {url}: {e}")
    finally:
        browser.quit()

    return BeautifulSoup(page_html, "html.parser")

# Site root; relative hrefs scraped from pages are appended to this.
BASE_URL = "https://www.transfermarkt.com"
# League start page from which the club links are collected.
LEAGUE_URL = BASE_URL + "/premier-league/startseite/wettbewerb/GB1"

## Scraping

### Getting All Premier League Club URLs

In [3]:
def get_club_links():
    """Collect the squad-page URL of every club linked from the league page.

    The league page is fetched with ``get_soup_with_selenium`` and the
    ``table.items`` element is searched for club overview links. Each link's
    ``startseite`` path segment is replaced with ``kader`` and prefixed with
    ``BASE_URL``.

    Returns:
        list[str]: Unique club URLs in the order they first appear in the
        table. Empty if the table is not present in the fetched page.
    """
    soup = get_soup_with_selenium(LEAGUE_URL)
    table = soup.find("table", class_="items")
    if not table:
        return []

    club_links = []
    for link in table.select("td.hauptlink a[href*='/startseite/verein']"):
        href = link.get("href")
        if href:
            club_links.append(BASE_URL + href.replace("startseite", "kader"))

    # dict.fromkeys drops duplicates while keeping first-seen order; set()
    # would hand the clubs back in an arbitrary order that can change from
    # one run to the next.
    return list(dict.fromkeys(club_links))

### Extracting Player Data from the Club's Squad Page

In [4]:
def _cell_text(tag):
    """Return the tag's stripped text with commas removed, or "N/A" if the tag is missing."""
    return tag.text.strip().replace(",", "") if tag else "N/A"


def _parse_player_row(row, club_name):
    """Build one player record (dict) from a single squad-table row."""
    nationality_imgs = row.select("td.zentriert:nth-of-type(4) img")
    return {
        "Name": _cell_text(
            row.select_one("td.posrela table.inline-table tr td.hauptlink a")
        ),
        # Raw text of the cell, not a parsed number.
        "Age": _cell_text(row.select_one("td.zentriert:nth-of-type(3)")),
        "Position": _cell_text(
            row.select_one("td.posrela table.inline-table tr:nth-of-type(2) td")
        ),
        # One image per nationality; the titles are joined with " / ".
        "Nationality": " / ".join(img.get("title", "") for img in nationality_imgs),
        "Market Value": _cell_text(row.select_one("td.rechts.hauptlink")),
        "Club Name": club_name,
    }


def get_players_from_club(club_url, attempts=3, retry_delay=5):
    """Scrape the player rows from one club's squad page.

    The page is fetched up to ``attempts`` times until a ``table.items``
    element is found. Between failed attempts (but not after the last one) a
    warning is printed and the function waits ``retry_delay`` seconds.

    Args:
        club_url: URL of the club's squad page.
        attempts: Maximum number of fetches before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        list[dict]: One record per parsed row with the keys ``Name``, ``Age``,
        ``Position``, ``Nationality``, ``Market Value`` and ``Club Name``.
        Cells that are missing are reported as ``"N/A"``. Empty if no table
        was found on any attempt.
    """
    soup = None
    table = None
    for attempt in range(1, attempts + 1):
        soup = get_soup_with_selenium(club_url)
        table = soup.find("table", class_="items")
        if table:
            break
        # Only announce and wait for a retry when one will actually happen.
        if attempt < attempts:
            print(f"⚠️ Attempt {attempt}/{attempts} failed at {club_url}, retrying...")
            time.sleep(retry_delay)

    if not table:
        print(f"❌ No player table found at {club_url} (503 Service Unavailable)")
        return []

    club_name_tag = soup.select_one("h1")
    club_name = club_name_tag.text.strip() if club_name_tag else "N/A"

    players = []
    for row in table.select("tbody > tr.odd, tbody > tr.even"):
        try:
            players.append(_parse_player_row(row, club_name))
        except Exception as e:
            # Best effort: a single malformed row must not discard the squad.
            print(f"⚠️ Error parsing row: {e}")
    return players

### Looping Through All Clubs to Collect Player Data

In [5]:
def scrape_premier_league_players(
    output_path="../../data/players/raw/premier_league_players.csv",
):
    """Scrape every club's squad and write all player records to one CSV file.

    Club URLs come from ``get_club_links`` and each club is scraped with
    ``get_players_from_club``. A random 6-10 second pause is inserted between
    clubs. Progress is printed per club.

    Args:
        output_path: Destination CSV path. Missing parent directories are
            created before writing.

    Returns:
        None. The result is the CSV file written to ``output_path``.
    """
    all_players = []
    club_links = get_club_links()
    total = len(club_links)
    print(f"Found {total} clubs.")

    for position, club_url in enumerate(club_links, start=1):
        print(f"Scraping club {position}/{total}: {club_url}")
        all_players.extend(get_players_from_club(club_url))
        # Pause only between requests; waiting after the last club is wasted time.
        if position < total:
            time.sleep(random.uniform(6, 10))  # polite, human-like delay

    # to_csv does not create directories, so make sure the target one exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(all_players)
    df.to_csv(output_path, index=False)
    print(f"Saved data to {output_path}")

## Running the Scraper

In [6]:
# Run the full scrape: prints progress per club and writes the collected players to CSV.
scrape_premier_league_players()

Found 20 clubs.
Scraping club 1/20: https://www.transfermarkt.com/nottingham-forest/kader/verein/703/saison_id/2024
Scraping club 2/20: https://www.transfermarkt.com/brighton-amp-hove-albion/kader/verein/1237/saison_id/2024
Scraping club 3/20: https://www.transfermarkt.com/crystal-palace/kader/verein/873/saison_id/2024
Scraping club 4/20: https://www.transfermarkt.com/fc-everton/kader/verein/29/saison_id/2024
Scraping club 5/20: https://www.transfermarkt.com/tottenham-hotspur/kader/verein/148/saison_id/2024
Scraping club 6/20: https://www.transfermarkt.com/ipswich-town/kader/verein/677/saison_id/2024
Scraping club 7/20: https://www.transfermarkt.com/west-ham-united/kader/verein/379/saison_id/2024
Scraping club 8/20: https://www.transfermarkt.com/fc-fulham/kader/verein/931/saison_id/2024
Scraping club 9/20: https://www.transfermarkt.com/fc-chelsea/kader/verein/631/saison_id/2024
Scraping club 10/20: https://www.transfermarkt.com/fc-southampton/kader/verein/180/saison_id/2024
Scraping cl