# Web Scraping through TransferMarkt for Players from the Top 10 Leagues

## Importing Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random

## Setup

In [2]:
BASE_URL = "https://www.transfermarkt.com"

LEAGUE_INFO = {
    "serie_a": {"slug": "serie-a", "code": "IT1"},
    "la_liga": {"slug": "laliga", "code": "ES1"},
    "bundesliga": {"slug": "bundesliga", "code": "L1"},
    "ligue_1": {"slug": "ligue-1", "code": "FR1"},
    "premier_league": {"slug": "premier-league", "code": "GB1"},
    "eredivisie": {"slug": "eredivisie", "code": "NL1"},
    "liga_portugal": {"slug": "liga-portugal", "code": "PO1"},
    "süper_lig": {"slug": "sueper-lig", "code": "TR1"},
    "jupiler_pro_league": {"slug": "jupiler-pro-league", "code": "BE1"},
    "austrian_bundesliga": {"slug": "bundesliga-at", "code": "A1"}
}

def get_soup_with_selenium(url, wait_for_table=False):
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.get(url)
        if wait_for_table:
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table.items"))
            )
        time.sleep(2)
        html = driver.page_source
    except Exception as e:
        print(f"⚠️ Failed to fetch {url}: {e}")
        html = ""
    finally:
        driver.quit()

    return BeautifulSoup(html, "html.parser")

## Scraping

### Getting All Clubs URL

In [3]:
def get_club_links(slug, code):
    league_url = f"{BASE_URL}/{slug}/startseite/wettbewerb/{code}"
    soup = get_soup_with_selenium(league_url)
    table = soup.find("table", class_="items")
    club_links = []
    if table:
        for link in table.select("td.hauptlink a[href*='/startseite/verein']"):
            href = link.get("href")
            if href:
                base_href = href.split("/saison_id/")[0]
                club_url = f"{BASE_URL}{base_href}/saison_id/2024"
                club_links.append(club_url)
    return list(set(club_links))

### Extracting Player Data From the Club's Page

In [4]:
def get_players_from_club(club_url, league_name):
    detailed_url = club_url + "/plus/1"
    compact_url = club_url

    detailed_soup = get_soup_with_selenium(detailed_url, wait_for_table=True)
    compact_soup = get_soup_with_selenium(compact_url, wait_for_table=True)

    detailed_table = detailed_soup.find("table", class_="items")
    compact_table = compact_soup.find("table", class_="items")

    if not detailed_table or not compact_table:
        print(f"❌ Could not load both tables for: {club_url}")
        return []

    detailed_rows = detailed_table.select("tbody > tr.odd, tbody > tr.even")
    compact_rows = compact_table.select("tbody > tr.odd, tbody > tr.even")

    players = []
    club_name_tag = detailed_soup.select_one("h1")
    club_name = club_name_tag.text.strip() if club_name_tag else "N/A"

    for detailed_row, compact_row in zip(detailed_rows, compact_rows):
        try:
            name_tag = detailed_row.select_one("td.posrela table.inline-table tr td.hauptlink a")
            name = name_tag.text.strip().replace(",", "") if name_tag else "N/A"
            
            profile_href = name_tag['href'] if name_tag and name_tag.has_attr('href') else None
            profile_url = BASE_URL + profile_href if profile_href else None

            age_tag = compact_row.select_one("td.zentriert:nth-of-type(3)")
            age = age_tag.text.strip() if age_tag else "N/A"

            nationality_imgs = compact_row.select("td.zentriert:nth-of-type(4) img")
            nationality = " / ".join([img.get("title", "") for img in nationality_imgs])

            position_tag = detailed_row.select_one("td.posrela table.inline-table tr:nth-of-type(2) td")
            position = position_tag.text.strip().replace(",", "") if position_tag else "N/A"

            market_value_tag = detailed_row.select_one("td.rechts.hauptlink")
            market_value = market_value_tag.text.strip().replace(",", "") if market_value_tag else "N/A"

            players.append({
                "Name": name,
                "Age": age,
                "Position": position,
                "Club Name": club_name,
                "League Name": league_name.replace("_", " ").title(),
                "Market Value": market_value,
                "Nationality": nationality,
                "Profile URL": profile_url
            })

            time.sleep(random.uniform(10, 20))

        except Exception as e:
            print(f"⚠️ Error parsing row: {e}")
            continue

    return players

### Scraping Through All The Leagues

In [5]:
for league_name, info in LEAGUE_INFO.items():
    print(f"\n=== Scraping {league_name.replace('_', ' ').title()} ===")
    all_players = []
    club_links = get_club_links(info['slug'], info['code'])
    print(f"Found {len(club_links)} clubs.")

    for idx, club_url in enumerate(club_links):
        print(f"\nScraping club {idx+1}/{len(club_links)}: {club_url}")
        players = get_players_from_club(club_url, league_name)
        all_players.extend(players)

        time.sleep(random.uniform(10, 20))

    df = pd.DataFrame(all_players)
    df.to_csv(f"../../../data/raw/transfermarkt/{league_name}_players.csv", index=False)
    print(f"📁 Saved data to ../../../data/raw/transfermarkt/{league_name}_players.csv")


=== Scraping Serie A ===
Found 20 clubs.

Scraping club 1/20: https://www.transfermarkt.com/ac-florenz/startseite/verein/430/saison_id/2024

Scraping club 2/20: https://www.transfermarkt.com/fc-turin/startseite/verein/416/saison_id/2024

Scraping club 3/20: https://www.transfermarkt.com/us-lecce/startseite/verein/1005/saison_id/2024

Scraping club 4/20: https://www.transfermarkt.com/atalanta-bergamo/startseite/verein/800/saison_id/2024

Scraping club 5/20: https://www.transfermarkt.com/inter-mailand/startseite/verein/46/saison_id/2024

Scraping club 6/20: https://www.transfermarkt.com/como-1907/startseite/verein/1047/saison_id/2024

Scraping club 7/20: https://www.transfermarkt.com/genua-cfc/startseite/verein/252/saison_id/2024

Scraping club 8/20: https://www.transfermarkt.com/us-sassuolo/startseite/verein/6574/saison_id/2024

Scraping club 9/20: https://www.transfermarkt.com/udinese-calcio/startseite/verein/410/saison_id/2024

Scraping club 10/20: https://www.transfermarkt.com/ssc-n