# Web Scraping Transfermarkt for Players from the Other Top 5 Leagues

## Importing Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random

## Setup

In [2]:
def get_soup_with_selenium(url):
    """Load ``url`` in a headless Chrome session and return the parsed page.

    A new Chrome driver is started for every call and is always shut down
    before returning. If loading the page raises, a warning is printed and
    an empty document is parsed instead, so callers always receive a
    ``BeautifulSoup`` object.

    Args:
        url: Address of the page to load.

    Returns:
        BeautifulSoup: The page source parsed with ``html.parser``; an empty
        soup when the fetch failed.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )

    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        f"user-agent={user_agent}",
    ):
        chrome_options.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=service, options=chrome_options)

    # Fallback used when the fetch below fails part-way through.
    page_html = ""
    try:
        browser.get(url)
        # Fixed pause before reading the source — presumably to let
        # script-rendered content finish loading.
        time.sleep(5)
        page_html = browser.page_source
    except Exception as e:
        print(f"⚠️ Failed to fetch {url}: {e}")
    finally:
        # Always release the browser process, even after a failure.
        browser.quit()

    return BeautifulSoup(page_html, "html.parser")

# Root of every Transfermarkt URL built in this notebook.
BASE_URL = "https://www.transfermarkt.com"

# Per-league URL components: "slug" is the league's path segment and "code"
# is the competition identifier that follows "/wettbewerb/" in the league URL.
LEAGUE_INFO = dict(
    serie_a=dict(slug="serie-a", code="IT1"),
    la_liga=dict(slug="laliga", code="ES1"),
    bundesliga=dict(slug="bundesliga", code="L1"),
    ligue_1=dict(slug="ligue-1", code="FR1"),
)

## Scraping

### Getting All Club URLs

In [3]:
def get_club_links(slug, code):
    """Collect the squad-page URL of every club listed on a league's page.

    The league overview page is fetched, and every club link found in its
    ``table.items`` element is rewritten from the club's "startseite" page
    to its "kader" page.

    Args:
        slug: League path segment used in the URL (e.g. ``"serie-a"``).
        code: Competition code used in the URL (e.g. ``"IT1"``).

    Returns:
        list[str]: Unique absolute club URLs in the order they first appear
        in the table; an empty list when the table is not found.
    """
    league_url = f"{BASE_URL}/{slug}/startseite/wettbewerb/{code}"
    soup = get_soup_with_selenium(league_url)
    table = soup.find("table", class_="items")
    if not table:
        return []

    hrefs = (
        link.get("href")
        for link in table.select("td.hauptlink a[href*='/startseite/verein']")
    )
    club_urls = (
        BASE_URL + href.replace("startseite", "kader") for href in hrefs if href
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the clubs in an arbitrary order that differs between runs.
    return list(dict.fromkeys(club_urls))

### Extracting Player Data from a Club's Page

In [4]:
def _clean_cell_text(tag):
    """Return the tag's stripped text with commas removed, or "N/A" if absent."""
    return tag.text.strip().replace(",", "") if tag else "N/A"


def _parse_player_row(row, club_name):
    """Build one player record (a dict) from a single row of the squad table."""
    name_tag = row.select_one("td.posrela table.inline-table tr td.hauptlink a")
    age_tag = row.select_one("td.zentriert:nth-of-type(3)")
    position_tag = row.select_one(
        "td.posrela table.inline-table tr:nth-of-type(2) td"
    )
    market_value_tag = row.select_one("td.rechts.hauptlink")

    # One flag image per nationality; images without a title are skipped so
    # they do not leave dangling " / " separators in the joined string.
    flag_titles = (
        img.get("title", "")
        for img in row.select("td.zentriert:nth-of-type(4) img")
    )
    nationality = " / ".join(title for title in flag_titles if title)

    return {
        "Name": _clean_cell_text(name_tag),
        "Age": _clean_cell_text(age_tag),
        "Position": _clean_cell_text(position_tag),
        "Nationality": nationality,
        "Market Value": _clean_cell_text(market_value_tag),
        "Club Name": club_name,
    }


def get_players_from_club(club_url):
    """Scrape every player listed in the squad table of a club page.

    The page is fetched up to three times, pausing five seconds between
    attempts, until a ``table.items`` element is found. Rows that fail to
    parse are reported and skipped rather than aborting the whole club.

    Args:
        club_url: Absolute URL of the club's squad page.

    Returns:
        list[dict]: One record per player with the keys "Name", "Age",
        "Position", "Nationality", "Market Value" and "Club Name". Missing
        cells are recorded as "N/A". An empty list is returned when no
        player table could be found after all attempts.
    """
    attempts = 3
    for attempt in range(1, attempts + 1):
        soup = get_soup_with_selenium(club_url)
        table = soup.find("table", class_="items")
        if table:
            break
        # Only announce a retry and wait when another attempt actually follows.
        if attempt < attempts:
            print(f"⚠️ Attempt {attempt}/{attempts} failed at {club_url}, retrying...")
            time.sleep(5)
    else:
        # The cause is unknown here (block page, layout change, fetch error),
        # so the message does not claim a specific HTTP status.
        print(f"❌ No player table found at {club_url} after {attempts} attempts")
        return []

    club_name_tag = soup.select_one("h1")
    club_name = club_name_tag.text.strip() if club_name_tag else "N/A"

    players = []
    for row in table.select("tbody > tr.odd, tbody > tr.even"):
        try:
            players.append(_parse_player_row(row, club_name))
        except Exception as e:
            # Best effort: one malformed row must not discard the whole squad.
            print(f"⚠️ Error parsing row: {e}")
            continue

    return players

### Scraping All the Leagues

In [5]:
# Create the output folder up front: without it, to_csv fails only after a
# whole league has been scraped, and that league's data is lost.
output_dir = "../data/raw"
os.makedirs(output_dir, exist_ok=True)

# Scrape each configured league in turn and write one CSV per league.
for league_name, info in LEAGUE_INFO.items():
    print(f"\n=== Scraping {league_name.replace('_', ' ').title()} ===")
    all_players = []
    club_links = get_club_links(info['slug'], info['code'])
    print(f"Found {len(club_links)} clubs.")

    for idx, club_url in enumerate(club_links):
        print(f"Scraping club {idx+1}/{len(club_links)}: {club_url}")
        players = get_players_from_club(club_url)
        all_players.extend(players)
        # Randomised pause between clubs to space out the requests.
        time.sleep(random.uniform(6, 10))

    df = pd.DataFrame(all_players)
    df.to_csv(f"{output_dir}/{league_name}_players.csv", index=False)
    print(f"Saved data to data/raw/{league_name}_players.csv")


=== Scraping Serie A ===
Found 20 clubs.
Scraping club 1/20: https://www.transfermarkt.com/ssc-neapel/kader/verein/6195/saison_id/2024
Scraping club 2/20: https://www.transfermarkt.com/ac-mailand/kader/verein/5/saison_id/2024
Scraping club 3/20: https://www.transfermarkt.com/fc-bologna/kader/verein/1025/saison_id/2024
Scraping club 4/20: https://www.transfermarkt.com/fc-empoli/kader/verein/749/saison_id/2024
Scraping club 5/20: https://www.transfermarkt.com/como-1907/kader/verein/1047/saison_id/2024
Scraping club 6/20: https://www.transfermarkt.com/parma-calcio-1913/kader/verein/130/saison_id/2024
Scraping club 7/20: https://www.transfermarkt.com/us-lecce/kader/verein/1005/saison_id/2024
Scraping club 8/20: https://www.transfermarkt.com/genua-cfc/kader/verein/252/saison_id/2024
Scraping club 9/20: https://www.transfermarkt.com/atalanta-bergamo/kader/verein/800/saison_id/2024
Scraping club 10/20: https://www.transfermarkt.com/udinese-calcio/kader/verein/410/saison_id/2024
Scraping club