In [1]:
from pathlib import Path
import cloudscraper
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Column keys of the scraped transfer table. Every other cell refers to
# columns through these names instead of repeating the string literals.

# Who moved, and which club the row belongs to.
CLUB, NAME, AGE, NATIONALITY = "club", "name", "age", "nationality"
# Playing position (long and abbreviated form) and market value.
POSITION, SHORT_POS, MARKET_VALUE = "position", "short_pos", "market_value"
# The other side of the deal and what was paid.
DEALING_CLUB, DEALING_COUNTRY, FEE = "dealing_club", "dealing_country", "fee"
# Direction of the transfer relative to CLUB.
MOVEMENT = "movement"

# Column order of every dataframe built from a transfer table.
COLUMN_HEADERS = [
    CLUB, NAME, AGE, NATIONALITY,
    POSITION, SHORT_POS, MARKET_VALUE,
    DEALING_CLUB, DEALING_COUNTRY, FEE,
    MOVEMENT,
]


def _export_csv(df, league_name, season_id):
    """Exports a league dataframe to the corresponding season's folder."""

    output_file = f"{league_name}.csv"
    output_dir = Path(f"data/{season_id}")
    output_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_dir / output_file, index=False, encoding='utf-8-sig')


def _get_soup(url):
    """Download ``url`` through cloudscraper and parse the body with lxml.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the raw response bytes.

    Raises
    ------
    Exception
        Whatever the scraper raised while requesting the page. The error and
        the offending url are printed first, then the original exception is
        re-raised so the caller never continues without a response.
    """

    headers = {'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
    )}

    # Created outside the try block so a failure here is not confused with a
    # failed request.
    scraper = cloudscraper.create_scraper()
    try:
        response = scraper.get(url, headers=headers)
    except Exception as e:
        # The previous handler looked up `scraper.exceptions`, which a scraper
        # session does not have, so the real error was hidden behind an
        # AttributeError. Log and propagate the real one instead.
        print(e)
        print("Failed with input url: " + url)
        raise

    print(response)
    return BeautifulSoup(response.content, 'lxml')


def _get_transfers(soup):
    """Collect every club's incoming and outgoing transfers from a league page.

    Club names are read from the ``.content-box-headline--logo a`` links and
    the tables from the first child of each ``div.responsive-table``. Tables
    are consumed in pairs: even positions are treated as arrivals ("in"), odd
    positions as departures ("out"), and the n-th pair is attributed to the
    n-th club name.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed transfers page.

    Returns
    -------
    pandas.DataFrame
        All rows of all tables stacked with a fresh index, or an empty frame
        with the ``COLUMN_HEADERS`` columns when nothing was found.
    """
    clubs = []
    for tag in soup.select(".content-box-headline--logo a"):
        club_name = tag.get_text(strip=True)
        if club_name:
            clubs.append(club_name)
    print(clubs)

    tables = [tag.findChild() for tag in soup.find_all('div', {'class': 'responsive-table'})]
    tables_in = tables[::2]
    tables_out = tables[1::2]

    # NOTE(review): zip() truncates silently, so the pairing assumes the page
    # has exactly two tables per listed club - confirm against the live markup.
    frames = []
    for club, table_in, table_out in zip(clubs, tables_in, tables_out):
        frames.append(_table_to_df(club, table_in, "in"))
        frames.append(_table_to_df(club, table_out, "out"))

    # A table without transfers can come back as None (or as an empty frame).
    # pd.concat raises when every entry is None, so drop the placeholders
    # before stacking.
    frames = [frame for frame in frames if frame is not None and not frame.empty]

    if not frames:
        return pd.DataFrame(columns=COLUMN_HEADERS)

    return pd.concat(frames, ignore_index=True)

def _parse_hyphenated_string(s):
    """Parses a hyphenated range into a list of integers."""

    # In: "2004-2007"
    # Out: [2004, 2005, 2006, 2007]
    list_of_lists = [list(range(*[int(second) + int(first) 
        for second, first in enumerate(substring.split('-'))])) 
        if '-' in substring else [int(substring)]
        for substring in s.split()]
    
    return [item for sublist in list_of_lists for item in sublist]


def _table_to_df(club, table, movement=None):
    """Create a dataframe of transfer data read from one html table.

    Every ``<tr>`` after the first one becomes one row. Cells are recognised
    by their CSS class; a cell without any class is taken to be the player
    name cell.

    Parameters
    ----------
    club : str
        Value written to the ``club`` column of every row.
    table : bs4 tag or None
        The table element to read. ``None`` is treated as "no rows".
    movement : str, optional
        Value written to the ``movement`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per parsed table row, columns exactly ``COLUMN_HEADERS``.
        Always a dataframe: a table that only holds a single-cell placeholder
        row yields an empty frame (this used to return ``None``).
    """

    age_class = 'alter-transfer-cell'
    nationality_class = 'nat-transfer-cell'
    position_class = 'pos-transfer-cell'
    shortpos_class = 'kurzpos-transfer-cell'
    marketvalue_class = 'mw-transfer-cell'
    dealingclub_class = 'verein-flagge-transfer-cell'
    fee_class = 'rechts'

    def text_of(tag):
        """Stripped text of ``tag``; None if the tag is missing or has no text."""
        if not tag or not tag.get_text():
            return None
        return tag.get_text(strip=True)

    def alt_of(tag):
        """``alt`` attribute of ``tag``; None if the tag or attribute is missing."""
        if not tag:
            return None
        return tag.get('alt') or None

    def fill(row, key, value):
        """Store ``value`` unless the row already holds one for ``key``."""
        if row[key] is None:
            row[key] = value

    if table is None:
        return pd.DataFrame(columns=COLUMN_HEADERS)

    records = []
    for tr in table.find_all('tr')[1:]:
        tds = tr.find_all('td')
        # Rows with at most one cell carry no player data.
        # NOTE(review): presumably the "no transfers" placeholder - confirm.
        if len(tds) <= 1:
            continue

        # One dict per row keeps all columns aligned even when a cell is
        # missing or duplicated; parallel per-column lists did not.
        row = {header: None for header in COLUMN_HEADERS}
        row[CLUB] = club
        row[MOVEMENT] = movement

        for td in tds:
            td_class = td.get('class')
            if not td_class:
                fill(row, NAME, text_of(td.find()))
            elif age_class in td_class:
                fill(row, AGE, text_of(td))
            elif nationality_class in td_class:
                fill(row, NATIONALITY, alt_of(td.find()))
            elif position_class in td_class:
                fill(row, POSITION, text_of(td))
            elif shortpos_class in td_class:
                fill(row, SHORT_POS, text_of(td))
            elif marketvalue_class in td_class:
                # Checked before fee_class on purpose: this must win for any
                # cell that carries both classes.
                fill(row, MARKET_VALUE, text_of(td))
            elif dealingclub_class in td_class:
                fill(row, DEALING_COUNTRY, alt_of(td.find('img')))
                fill(row, DEALING_CLUB, text_of(td.find('a')))
            elif fee_class in td_class:
                fill(row, FEE, text_of(td))

        records.append(row)

    return pd.DataFrame(records, columns=COLUMN_HEADERS)

In [3]:
def scrape_transfermarkt(
    league_name,
    league_id,
    season_id,
    base="https://transfermarkt.com",
    window="",
    loans="3",
    internal="0",
    clean=False,
    write=False
):
    """Scrape one league's transfers for one season into a dataframe.

    Parameters
    ----------
    league_name : str
        League slug; used in the url path, as the ``league`` column and as
        the CSV file name when ``write`` is set.
    league_id : str
        Competition id placed after ``/wettbewerb/`` in the url.
    season_id : str or int
        Sent as ``saison_id``; its string form fills the ``season`` column.
    base : str
        Site root the request path is appended to.
    window : str
        ``"s"`` (summer) or ``"w"`` (winter) to scrape one window; empty to
        scrape both, in which case the rows are sorted by club and window.
    loans : str
        Sent as the ``leihe`` query parameter.
    internal : str
        Sent as the ``intern`` query parameter.
    clean : bool
        Run the result through ``tidy_transfers`` before returning.
    write : bool
        Export the result with ``_export_csv``.

    Returns
    -------
    pandas.DataFrame
        The scraped rows plus ``window``, ``league`` and ``season`` columns.
    """

    window_labels = {
        "s": "summer",
        "w": "winter"
    }
    season = str(season_id)

    def fetch_window(code):
        """Request and parse a single window, then label its rows."""
        url = base + (
            f"/{league_name}/transfers/wettbewerb/{league_id}/plus/?"
            f"saison_id={season_id}&s_w={code}&leihe={loans}"
            f"&intern={internal}"
        )
        frame = _get_transfers(_get_soup(url))
        frame['window'] = window_labels[code]
        return frame

    if window:
        transfers_df = fetch_window(window)
    else:
        per_window = [fetch_window(code) for code in window_labels]
        stacked = pd.concat(per_window, ignore_index=True)
        transfers_df = stacked.sort_values(by=['club', 'window'], ignore_index=True)

    transfers_df['league'] = league_name
    transfers_df['season'] = season

    if clean:
        transfers_df = tidy_transfers(transfers_df)
    if write:
        _export_csv(transfers_df, league_name, season)

    return transfers_df


def tidy_transfers(dataframe):
    """Return a cleaned copy of a scraped transfers dataframe.

    The input is not modified. The result has two extra columns, ``is_loan``
    and ``loan_status``, derived from the free-text ``fee`` column, and:

    * ``fee`` and ``market_value`` converted to numbers (``m`` = millions,
      ``th.`` or ``k`` = thousands, comma accepted as decimal separator;
      ``-``, ``?``, missing or unparseable values become NaN),
    * ``age`` converted to a number (unparseable values become NaN),
    * ``season`` reduced to its year,
    * missing values in the text columns replaced by ``''``,
    * ``league`` turned from a slug into a title ("premier-league" ->
      "Premier League").

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``fee``, ``market_value``, ``age``,
        ``season``, ``league``, ``nationality``, ``position``, ``short_pos``,
        ``dealing_club`` and ``dealing_country``.

    Returns
    -------
    pandas.DataFrame
    """

    def format_fees_and_loans(row):
        """Normalise the fee text of one row and flag loan moves."""

        # apply() does not support mutating the object it passes in, so work
        # on a private copy of the row.
        row = row.copy()
        if pd.isna(row['fee']):
            return row

        fee = row['fee'].lower()
        row['fee'] = fee

        # "$0" is a placeholder: value_as_numeric drops the leading currency
        # character and parses the remainder.
        if fee.startswith("end of loan"):
            row['fee'] = "$0"
            row['is_loan'] = True
            row['loan_status'] = "end of loan"
        elif fee.startswith("loan fee"):
            row['fee'] = fee.replace("loan fee:", '')
            row['is_loan'] = True
            row['loan_status'] = "loan with fee"
        elif fee.startswith("loan transfer"):
            row['fee'] = "$0"
            row['is_loan'] = True
            row['loan_status'] = "free loan"
        elif fee.startswith("free transfer"):
            row['fee'] = "$0"

        return row

    def value_as_numeric(val):
        """Convert a money string such as '€1.50m' to a number, else NaN."""

        if pd.isna(val) or val == '-' or val == '?':
            return np.nan

        if val == '0':
            return 0

        # Drop the leading currency character. Replacing ',' is a hack to get
        # around some entries using comma decimal separators.
        val = val[1:].lower().replace(',', '.')
        if val.endswith('m'):
            return pd.to_numeric(val[:-1], errors='coerce') * 1e6
        if val.endswith('th.'):
            return pd.to_numeric(val[:-3], errors='coerce') * 1e3
        if val.endswith('k'):
            return pd.to_numeric(val[:-1], errors='coerce') * 1e3
        return pd.to_numeric(val, errors='coerce')

    tidy = (
        dataframe.assign(is_loan=False, loan_status='')
        .apply(format_fees_and_loans, axis=1)
        .assign(
            market_value=lambda df: df['market_value'].apply(value_as_numeric),
            fee=lambda df: df['fee'].apply(value_as_numeric)
        )
    )
    tidy['age'] = pd.to_numeric(tidy['age'], errors='coerce')
    tidy['season'] = pd.to_datetime(tidy['season']).dt.year

    # Assign the filled columns back instead of calling fillna(inplace=True)
    # on a column selection: that form only updates a temporary object once
    # pandas runs with Copy-on-Write.
    text_columns = ['nationality', 'position', 'short_pos', 'dealing_club', 'dealing_country']
    tidy[text_columns] = tidy[text_columns].fillna('')

    tidy['league'] = tidy['league'].str.replace('-', ' ', regex=False).str.title()

    return tidy

In [4]:
# Currency menu: the chosen number is later used to look up the site to scrape.
print(
    "\nSelect currency (default is euro):",
    "[1] EUR €",
    "[2] GBP £",
    "[3] USD $",
    sep='\n'
)
options = list(range(1, 4))
while True:
    # Empty input falls back to option 1; anything that is not an integer is
    # handled like an out-of-range number.
    try:
        localization = int(input("===> ") or 1)
    except ValueError:
        localization = None
    if localization in options:
        break
    print("Error: Please input one of 1, 2, or 3.")

print('\n'.join((
    "\nSelect league(s), e.g. '1', '3 5', '6-10' (default is top 5):",
    "[1]  ENG Premier League", 
    "[2]  ESP La Liga",
    "[3]  GER Bundesliga",
    "[4]  ITA Serie A",
    "[5]  FRA Ligue 1",
    "[6]  POR Liga Portugal",
    "[7]  TUR Süper Lig",
    "[8]  NLD Eredivisie",
    "[9]  BEL Jupiler Pro League",
    "[10] RUS Premier Liga",
    "[11] GRE Super League 1",
    "[12] AUS Bundesliga",
    "[13] SCO Scottish Premiership",
    "[14] SWI Super League",
    "[15] UKR Premier Liga",
    "[16] PLO Ekstraklasa",
    "[17] DMK Superliga",
    "[18] SER Super liga Srbije",
    "[19] SWE Allsvenskan",
    "[20] CRO SuperSport HNL"
)))

# The menu above lists 20 leagues, so all 20 must be selectable (the previous
# upper bound of 10 rejected entries 11-20). Keep this in step with the menu.
options = list(range(1, 21))
while True:
    try:
        choices = _parse_hyphenated_string(input("===> "))
    except ValueError:
        print(f"Error: Please input within the range 1-{len(options)}.")
        continue

    # Empty input selects the default: the first five leagues.
    if not choices:
        leagues = list(range(1, 6))
        break

    if all(league in options for league in choices):
        leagues = choices
        break

    print(f"Error: Please input within the range 1-{len(options)}.")

# English top-flight data are no longer available pre-1992/93 (?)
print(
    "\nEnter desired seasons as years (default is current season).",
    "Years should be input as the first calendar year in a season, e.g. '2015' for the 2015/16 season.",
    "You can input both indiviudal years and year ranges, e.g. '1992 2004-2007'.",
    sep='\n'
)
while True:
    current_year = datetime.now().year
    valid_seasons = list(range(1992, current_year + 1))

    # Unparseable input is reported exactly like an out-of-range year.
    try:
        choices = _parse_hyphenated_string(input("===> "))
    except ValueError:
        choices = None

    if choices is not None and all(season in valid_seasons for season in choices):
        # An empty answer means "just the current year".
        seasons = choices or [current_year]
        break

    print("Error: Seasons are limited to the range 1992-{}.".format(current_year))
    
# Transfer window menu: the numeric answer is translated into the code that
# is passed on as `window` ("" = both, "s" = summer, "w" = winter).
print(
    "\nSelect transfer window (default is both):",
    "[1] Both",
    "[2] Summer",
    "[3] Winter",
    sep='\n'
)
options = list(range(1, 4))
while True:
    # Empty input falls back to option 1; non-integers count as invalid.
    try:
        window = int(input("===> ") or 1)
    except ValueError:
        window = None
    if window in options:
        window = {1: "", 2: "s", 3: "w"}[window]
        break
    print("Error: Please input one of 1, 2, or 3.")

# Loan handling menu: the stored value is the menu number minus one.
print(
    "\nSelect how to handle loan transfers (default is without players back from loan):",
    "[1] Exclude loans",
    "[2] Include loans",
    "[3] Loans only",
    "[4] Without players back from loan",
    sep='\n'
)
options = list(range(1, 5))
while True:
    # Empty input falls back to option 4; non-integers count as invalid.
    try:
        loans = int(input("===> ") or 4)
    except ValueError:
        loans = None
    if loans in options:
        loans -= 1
        break
    print("Error: Please input one of 1, 2, 3, or 4.")

def _ask_yes_no(default):
    """Read a yes/no answer from stdin, asking again until it is valid.

    Accepts ``y``/``yes``/``n``/``no`` in any letter case. Returns ``True``
    for yes, ``False`` for no and ``default`` when the answer is empty.
    """
    while True:
        choice = input("===> ").lower()
        if not choice:
            return default
        if choice in ('y', 'yes'):
            return True
        if choice in ('n', 'no'):
            return False
        print("Please enter y/n.")


print("\nExclude player movements within clubs (Y/n)?")
# The flag is inverted relative to the question: "yes, exclude" is stored as 0
# and "no" as 1.
internal = 0 if _ask_yes_no(default=True) else 1

print("\nClean the data with tidy_transfers (Y/n)?")
clean = _ask_yes_no(default=True)

# Site root per currency menu entry.
sites = dict(enumerate(
    (
        "https://transfermarkt.com",
        "https://www.transfermarkt.co.uk",
        "https://www.transfermarkt.us",
    ),
    start=1
))
base = sites[localization]

# League menu number -> (competition id, url slug). Both lookup dicts below
# are derived from this single table so that they cannot drift apart.
league_table = {
    1: ("GB1", "premier-league"),
    2: ("ES1", "laliga"),
    3: ("L1", "1-bundesliga"),
    4: ("IT1", "serie-a"),
    5: ("FR1", "ligue-1"),
    6: ("PO1", "liga-portugal"),
    7: ("TR1", "super-lig"),
    8: ("NL1", "eredivisie"),
    9: ("BE1", "jupiler-pro-league"),
    10: ("RU1", "premier-liga"),
    11: ("GR1", "super-league-1"),
    12: ("A1", "bundesliga"),
    13: ("SC1", "scottish-premiership"),
    14: ("C1", "super-league"),
    15: ("UKR1", "premier-liga"),
    16: ("PL1", "pko-bp-ekstraklasa"),
    17: ("DK1", "superliga"),
    18: ("SER1", "super-liga-srbije"),
    19: ("SE1", "allsvenskan"),
    20: ("KR1", "supersport-hnl"),
}
league_ids = {number: entry[0] for number, entry in league_table.items()}
league_names = {number: entry[1] for number, entry in league_table.items()}

# Both values are sent as strings from here on.
loans, internal = str(loans), str(internal)

print(f"\nNow scraping data for {len(leagues)} league(s) over {len(seasons)} season(s).")

# Two leagues can share a slug (10 and 15 are both "premier-liga"), and the
# slug is the CSV file name inside a season folder - the second export would
# overwrite the first. The first league keeps the bare slug, so existing file
# names do not change; later ones get their competition id appended.
file_stems = {}
for key, slug in league_names.items():
    if slug in file_stems.values():
        file_stems[key] = f"{slug}-{league_ids[key].lower()}"
    else:
        file_stems[key] = slug

i = 1
n = len(leagues) * len(seasons)
for season in seasons:
    print(f"Scraping the {season}/{str(season + 1)[-2:]} season:")
    for k in leagues:
        league_name = league_names[k]
        league_id = league_ids[k]
        season_id = str(season)

        # '\x1b[2K' clears the terminal line before the progress text.
        print('\x1b[2K' + "Requesting...", end='\r', flush=True)
        # Export here rather than via write=True so the file name can be
        # disambiguated; the folder layout is unchanged.
        transfers = scrape_transfermarkt(
            league_name=league_name,
            league_id=league_id,
            season_id=season_id,
            base=base,
            window=window,
            loans=loans,
            internal=internal,
            clean=clean,
            write=False
            )
        _export_csv(transfers, file_stems[k], season_id)
        print(f"({i}/{n}) Done with: {league_id + ' ' + league_name}")

        i += 1

print("\nDone!\n")


Select currency (default is euro):
[1] EUR €
[2] GBP £
[3] USD $

Select league(s), e.g. '1', '3 5', '6-10' (default is top 5):
[1]  ENG Premier League
[2]  ESP La Liga
[3]  GER Bundesliga
[4]  ITA Serie A
[5]  FRA Ligue 1
[6]  POR Liga Portugal
[7]  TUR Süper Lig
[8]  NLD Eredivisie
[9]  BEL Jupiler Pro League
[10] RUS Premier Liga
[11] GRE Super League 1
[12] AUS Bundesliga
[13] SCO Scottish Premiership
[14] SWI Super League
[15] UKR Premier Liga
[16] PLO Ekstraklasa
[17] DMK Superliga
[18] SER Super liga Srbije
[19] SWE Allsvenskan
[20] CRO SuperSport HNL

Enter desired seasons as years (default is current season).
Years should be input as the first calendar year in a season, e.g. '2015' for the 2015/16 season.
You can input both indiviudal years and year ranges, e.g. '1992 2004-2007'.

Select transfer window (default is both):
[1] Both
[2] Summer
[3] Winter

Select how to handle loan transfers (default is without players back from loan):
[1] Exclude loans
[2] Include loans
[3] Loa

In [5]:
# import cloudscraper
# from bs4 import BeautifulSoup
# url = 'https://www.transfermarkt.com/premier-league/transfers/wettbewerb/GB1/plus/?saison_id=2021&s_w=s&leihe=1&intern=0&intern=1'
# scraper = cloudscraper.create_scraper()
# response = scraper.get(url)
# soup = BeautifulSoup(response.text,"html.parser")
# clubs = [club.get_text(strip=True) for club in soup.select(".content-box-headline--logo a")]
# print(clubs)