In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from tqdm import tqdm

# Request headers passed as `headers=` to the `requests.get` calls in the
# scraping functions below: a desktop-browser User-Agent string.
# NOTE(review): presumably needed because the site rejects the default
# python-requests User-Agent — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
}

In [None]:
# Collect the profile links of the clubs that belong to a league,
# given the link to that league.
def get_clubs_from_league(league_link: str, timeout: float = 30):
    """
    Return the club profile links listed on a transfermarkt.com league page.

    :param league_link: link to the league profile on transfermarkt.com
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: list of absolute links to the clubs in the league; links from
        rows with the "odd" class come first, then those from "even" rows.
    :raises requests.RequestException: if the request fails, times out or
        the server answers with an HTTP error status.
    :raises ValueError: if the page does not contain the clubs table.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(league_link, headers=headers, timeout=timeout)
    response.raise_for_status()

    site = BeautifulSoup(response.content, "lxml")
    table_wrapper = site.find("div", class_="responsive-table")
    clubs_box = table_wrapper.find("table", class_="items") if table_wrapper else None
    if clubs_box is None:
        raise ValueError(f"No clubs table found on {league_link}")

    # Both row kinds are searched inside the clubs table only. Searching the
    # whole page for "even" rows and cutting off the last two entries depended
    # on how many unrelated tables the page happened to contain.
    club_rows = (
        clubs_box.find_all("tr", class_="odd")
        + clubs_box.find_all("tr", class_="even")
    )

    clubs_link = []
    for club_row in club_rows:
        # The first anchor of the row is taken as the club profile link.
        club_a_box = club_row.find("a")
        clubs_link.append("https://www.transfermarkt.com" + club_a_box["href"])

    return clubs_link

In [None]:
# Collect the profile links of the players that belong to a club,
# given the link to that club.
def get_players_from_club(club_link, timeout=30):
    """
    Return the player profile links listed on a transfermarkt.com club page.

    :param club_link: link to the club profile on transfermarkt.com
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: list of absolute links to the players of the club; links from
        rows with the "odd" class come first, then those from "even" rows.
    :raises requests.RequestException: if the request fails, times out or
        the server answers with an HTTP error status.
    :raises ValueError: if the page does not contain the squad table.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(club_link, headers=headers, timeout=timeout)
    response.raise_for_status()

    site = BeautifulSoup(response.content, "lxml")
    table_wrapper = site.find("div", class_="responsive-table")
    players_box = table_wrapper.find("table", class_="items") if table_wrapper else None
    if players_box is None:
        raise ValueError(f"No squad table found on {club_link}")

    player_rows = (
        players_box.find_all("tr", class_="odd")
        + players_box.find_all("tr", class_="even")
    )

    players_link = []
    for player_row in player_rows:
        # The link sits in a "hauptlink" cell nested inside the "posrela" cell.
        player = player_row.find("td", class_="posrela")
        player_link_box = player.find("td", class_="hauptlink")
        player_a_box = player_link_box.find("a")
        players_link.append("https://www.transfermarkt.com" + player_a_box["href"])

    return players_link

In [None]:
# Collect the league links of a region, given the region name.
def get_leagues(region='europa', timeout=30):
    """
    Scrape the competition overview pages of a region on transfermarkt.com.

    :param region: region to scrape: Europe ('europa') or Asia ('asien')
    :param timeout: seconds to wait for each HTTP response before giving up
    :return: list with one inner list per overview page; each inner list holds
        the absolute links taken from the first anchor of every element whose
        class is exactly "hauptlink". An empty list for any other region.
    :raises requests.RequestException: if a request fails, times out or the
        server answers with an HTTP error status.
    """
    # Number of overview pages requested for each supported region.
    pages_by_region = {'europa': 19, 'asien': 4}
    pages = pages_by_region.get(region)
    if pages is None:
        return []

    league_links = []
    for page in tqdm(range(1, pages + 1)):
        link = f"https://www.transfermarkt.com/wettbewerbe/{region}/wettbewerbe?ajax=yw1&plus=1&page={page}"
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(link, headers=headers, timeout=timeout)
        response.raise_for_status()

        site = BeautifulSoup(response.content, "lxml")
        # Exact comparison of the class list: elements that carry "hauptlink"
        # together with other classes are not matched.
        league_soups = site.find_all(lambda tag: tag.get('class') == ['hauptlink'])
        league_links.append(
            ["https://www.transfermarkt.com" + cell.find('a')['href'] for cell in league_soups]
        )
    return league_links

In [None]:
# Collect the league links for Europe.
# The bare name on the last line displays the result as the cell output.
europe_links = get_leagues('europa')
europe_links

100%|██████████| 19/19 [01:16<00:00,  4.01s/it]


[['https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1',
  'https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1',
  'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1',
  'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1',
  'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1',
  'https://www.transfermarkt.com/liga-portugal/startseite/wettbewerb/PO1',
  'https://www.transfermarkt.com/eredivisie/startseite/wettbewerb/NL1',
  'https://www.transfermarkt.com/super-lig/startseite/wettbewerb/TR1',
  'https://www.transfermarkt.com/jupiler-pro-league/startseite/wettbewerb/BE1',
  'https://www.transfermarkt.com/premier-liga/startseite/wettbewerb/RU1',
  'https://www.transfermarkt.com/super-league-1/startseite/wettbewerb/GR1',
  'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/A1',
  'https://www.transfermarkt.com/scottish-premiership/startseite/wettbewerb/SC1',
  'https://www.transfermarkt.com/super-league/

In [None]:
# Split the links into tiers.
# get_leagues returns one list per page; flatten them into a single list.
europe_merged_links = [link for sublist in europe_links for link in sublist]

# The first five links. tier_1 starts at the same position, so the two overlap.
top_5 = europe_merged_links[:5]

# (group name, URL of the first competition of that group). Every group runs
# from its own marker up to the marker of the next group; tier_1 starts at the
# beginning of the list and has no marker of its own.
_europe_markers = [
    ("tier_2", "https://www.transfermarkt.com/championship/startseite/wettbewerb/GB2"),
    ("tier_3", "https://www.transfermarkt.com/league-one/startseite/wettbewerb/GB3"),
    ("tier_4", "https://www.transfermarkt.com/league-two/startseite/wettbewerb/GB4"),
    ("tier_5", "https://www.transfermarkt.com/national-league/startseite/wettbewerb/CNAT"),
    ("tier_6", "https://www.transfermarkt.com/national-league-north/startseite/wettbewerb/NLN6"),
    ("youth_league", "https://www.transfermarkt.com/primavera-1/startseite/wettbewerb/IJ1"),
    ("domestic_cup", "https://www.transfermarkt.com/fa-cup/startseite/wettbewerb/FAC"),
    ("domestic_super_cup", "https://www.transfermarkt.com/supercopa/startseite/wettbewerb/SUC"),
    ("play_offs", "https://www.transfermarkt.com/protathlima-cyta-championship-round/startseite/wettbewerb/ZYPM"),
    ("league_cup", "https://www.transfermarkt.com/efl-cup/startseite/wettbewerb/CGB"),
    ("domestic_youth_cup", "https://www.transfermarkt.com/coppa-italia-primavera/startseite/wettbewerb/CITP"),
    ("reserve_league", "https://www.transfermarkt.com/premier-league-2/startseite/wettbewerb/GB21"),
    ("further_cup", "https://www.transfermarkt.com/efl-trophy/startseite/wettbewerb/GBFL"),
    ("national_youth_super_cup", "https://www.transfermarkt.com/supercoppa-primavera/startseite/wettbewerb/SCIJ"),
]

# Slice boundaries: start of the list, position of every marker, end of the
# list. list.index raises ValueError when a marker URL is not in the scrape.
_europe_names = ["tier_1"] + [name for name, _ in _europe_markers]
_europe_bounds = (
    [0]
    + [europe_merged_links.index(url) for _, url in _europe_markers]
    + [len(europe_merged_links)]
)
_europe_groups = {
    name: europe_merged_links[start:stop]
    for name, start, stop in zip(_europe_names, _europe_bounds, _europe_bounds[1:])
}

tier_1 = _europe_groups["tier_1"]
tier_2 = _europe_groups["tier_2"]
tier_3 = _europe_groups["tier_3"]
tier_4 = _europe_groups["tier_4"]
tier_5 = _europe_groups["tier_5"]
tier_6 = _europe_groups["tier_6"]
youth_league = _europe_groups["youth_league"]
domestic_cup = _europe_groups["domestic_cup"]
domestic_super_cup = _europe_groups["domestic_super_cup"]
play_offs = _europe_groups["play_offs"]
league_cup = _europe_groups["league_cup"]
domestic_youth_cup = _europe_groups["domestic_youth_cup"]
reserve_league = _europe_groups["reserve_league"]
further_cup = _europe_groups["further_cup"]
# The last group runs to the end of the list and is a list like the others;
# a bare index here would yield a single URL string, which downstream loops
# would iterate character by character.
national_youth_super_cup = _europe_groups["national_youth_super_cup"]

In [None]:
# Mapping from group name to the list of league links in that group.
# Insertion order is the order in which the groups are processed later.
league_dict = dict(
    top_5=top_5,
    tier_1=tier_1,
    tier_2=tier_2,
    tier_3=tier_3,
    tier_4=tier_4,
    tier_5=tier_5,
    tier_6=tier_6,
    youth_league=youth_league,
    domestic_cup=domestic_cup,
    domestic_super_cup=domestic_super_cup,
    play_offs=play_offs,
    league_cup=league_cup,
    domestic_youth_cup=domestic_youth_cup,
    reserve_league=reserve_league,
    further_cup=further_cup,
    national_youth_super_cup=national_youth_super_cup,
)

In [None]:
# Collect the player links, one CSV file per league group.
# NOTE(review): the file name depends only on the group name, and the Asia
# cell further down writes groups with the same names — confirm the files of
# this run are moved or renamed before that cell is executed.
errs = []  # one message per league or club that could not be scraped
for leagues, league_urls in league_dict.items():
    print(f"\n\n{leagues}:")

    # A group given as a single URL string would otherwise be iterated
    # character by character.
    if isinstance(league_urls, str):
        league_urls = [league_urls]

    club_links = []
    for league in league_urls:
        try:
            club_links.append(get_clubs_from_league(league))
        # Best effort: log and continue. `Exception` (not a bare except) so
        # that Ctrl+C / kernel interrupt still stops the long-running scrape.
        except Exception:
            message = f"FILE: {leagues}; SKIPPED LEAGUE: {league}"
            print(message)
            errs.append(message)
    club_merged_links = [link for sublist in club_links for link in sublist]

    player_links = []
    for club in club_merged_links:
        try:
            player_links.append(get_players_from_club(club))
        except Exception:
            message = f"FILE: {leagues}; SKIPPED CLUB: {club}"
            print(message)
            errs.append(message)
    player_merged_links = [link for sublist in player_links for link in sublist]

    pd.DataFrame(player_merged_links, columns=['link']).to_csv(f"{leagues}_links.csv")



tier_3:
FILE: tier_3; SKIPPED CLUB: https://www.transfermarkt.com/torpedo-2-moskau/startseite/verein/97258/saison_id/2023
FILE: tier_3; SKIPPED CLUB: https://www.transfermarkt.com/ska-rostov-na-donu/startseite/verein/6673/saison_id/2023
FILE: tier_3; SKIPPED CLUB: https://www.transfermarkt.com/khimik-august-vurnary/startseite/verein/68037/saison_id/2023


tier_4:
FILE: tier_4; SKIPPED CLUB: https://www.transfermarkt.com/stroitel-kamensk-shakhtinskiy/startseite/verein/117533/saison_id/2023
FILE: tier_4; SKIPPED CLUB: https://www.transfermarkt.com/sc-ritzing/startseite/verein/7445/saison_id/2023


tier_5:
FILE: tier_5; SKIPPED CLUB: https://www.transfermarkt.com/bc-viktoria-glesch-paffendorf/startseite/verein/40310/saison_id/2023


tier_6:
FILE: tier_6; SKIPPED CLUB: https://www.transfermarkt.com/sv-neukirchen-b-hl-blut/startseite/verein/37151/saison_id/2023


youth_league:
FILE: youth_league; SKIPPED CLUB: https://www.transfermarkt.com/akademia-ural-ekaterinburg-u16/startseite/verein/

In [None]:
# Collect the league links for Asia.
# The bare name on the last line displays the result as the cell output.
asia_links = get_leagues('asien')
asia_links

100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


[['https://www.transfermarkt.com/saudi-pro-league/startseite/wettbewerb/SA1',
  'https://www.transfermarkt.com/uae-pro-league/startseite/wettbewerb/UAE1',
  'https://www.transfermarkt.com/j1-league/startseite/wettbewerb/JAP1',
  'https://www.transfermarkt.com/qatar-stars-league/startseite/wettbewerb/QSL',
  'https://www.transfermarkt.com/k-league-1/startseite/wettbewerb/RSK1',
  'https://www.transfermarkt.com/chinese-super-league/startseite/wettbewerb/CSL',
  'https://www.transfermarkt.com/persian-gulf-pro-league/startseite/wettbewerb/IRN1',
  'https://www.transfermarkt.com/a-league-men/startseite/wettbewerb/AUS1',
  'https://www.transfermarkt.com/thai-league/startseite/wettbewerb/THA1',
  'https://www.transfermarkt.com/ozbekiston-superligasi/startseite/wettbewerb/UZ1',
  'https://www.transfermarkt.com/liga-1-indonesia/startseite/wettbewerb/IN1L',
  'https://www.transfermarkt.com/indian-super-league/startseite/wettbewerb/IND1',
  'https://www.transfermarkt.com/malaysia-super-league/sta

In [None]:
# Split the links into tiers.
# get_leagues returns one list per page; flatten them into a single list.
asia_merged_links = [link for sublist in asia_links for link in sublist]

# The first five links. tier_1 starts at the same position, so the two overlap.
top_5 = asia_merged_links[:5]

# (group name, URL of the first competition of that group). Every group runs
# from its own marker up to the marker of the next group; tier_1 starts at the
# beginning of the list and has no marker of its own.
_asia_markers = [
    ("tier_2", "https://www.transfermarkt.com/j2-league/startseite/wettbewerb/JAP2"),
    ("tier_3", "https://www.transfermarkt.com/j3-league/startseite/wettbewerb/JAP3"),
    ("tier_4", "https://www.transfermarkt.com/japan-football-league/startseite/wettbewerb/JFL"),
    ("tier_5", "https://www.transfermarkt.com/kanto-soccer-league-div-1-/startseite/wettbewerb/KTOS"),
    ("tier_6", "https://www.transfermarkt.com/kansai-soccer-league-div-2-/startseite/wettbewerb/J6KA"),
    ("youth_league", "https://www.transfermarkt.com/prince-takamado-u18-premier-league-west/startseite/wettbewerb/J18W"),
    ("domestic_cup", "https://www.transfermarkt.com/kings-cup/startseite/wettbewerb/SAKC"),
    ("domestic_super_cup", "https://www.transfermarkt.com/super-cup/startseite/wettbewerb/INSC"),
    ("play_offs", "https://www.transfermarkt.com/lebanese-premier-league-championship-round/startseite/wettbewerb/LB1M"),
    ("league_cup", "https://www.transfermarkt.com/j-league-cup/startseite/wettbewerb/JAPC"),
]

# Slice boundaries: start of the list, position of every marker, end of the
# list. list.index raises ValueError when a marker URL is not in the scrape.
_asia_names = ["tier_1"] + [name for name, _ in _asia_markers]
_asia_bounds = (
    [0]
    + [asia_merged_links.index(url) for _, url in _asia_markers]
    + [len(asia_merged_links)]
)
_asia_groups = {
    name: asia_merged_links[start:stop]
    for name, start, stop in zip(_asia_names, _asia_bounds, _asia_bounds[1:])
}

tier_1 = _asia_groups["tier_1"]
tier_2 = _asia_groups["tier_2"]
tier_3 = _asia_groups["tier_3"]
tier_4 = _asia_groups["tier_4"]
tier_5 = _asia_groups["tier_5"]
tier_6 = _asia_groups["tier_6"]
youth_league = _asia_groups["youth_league"]
domestic_cup = _asia_groups["domestic_cup"]
domestic_super_cup = _asia_groups["domestic_super_cup"]
play_offs = _asia_groups["play_offs"]
# The last group runs to the very end of the list. A slice ending at -1 would
# silently leave out the last scraped competition.
league_cup = _asia_groups["league_cup"]

In [None]:
# Mapping from group name to the list of league links in that group.
# Insertion order is the order in which the groups are processed later.
league_dict = dict(
    top_5=top_5,
    tier_1=tier_1,
    tier_2=tier_2,
    tier_3=tier_3,
    tier_4=tier_4,
    tier_5=tier_5,
    tier_6=tier_6,
    youth_league=youth_league,
    domestic_cup=domestic_cup,
    domestic_super_cup=domestic_super_cup,
    play_offs=play_offs,
    league_cup=league_cup,
)

In [None]:
# Collect the player links, one CSV file per league group.
# NOTE(review): the file name depends only on the group name, and the Europe
# cell further up writes groups with the same names — running this cell in the
# same working directory overwrites those files; confirm this is intended.
errs = []  # one message per league or club that could not be scraped
for leagues, league_urls in league_dict.items():
    print(f"\n\n{leagues}:")

    # A group given as a single URL string would otherwise be iterated
    # character by character.
    if isinstance(league_urls, str):
        league_urls = [league_urls]

    club_links = []
    for league in league_urls:
        try:
            club_links.append(get_clubs_from_league(league))
        # Best effort: log and continue. `Exception` (not a bare except) so
        # that Ctrl+C / kernel interrupt still stops the long-running scrape.
        except Exception:
            message = f"FILE: {leagues}; SKIPPED LEAGUE: {league}"
            print(message)
            errs.append(message)
    club_merged_links = [link for sublist in club_links for link in sublist]

    player_links = []
    for club in club_merged_links:
        try:
            player_links.append(get_players_from_club(club))
        except Exception:
            message = f"FILE: {leagues}; SKIPPED CLUB: {club}"
            print(message)
            errs.append(message)
    player_merged_links = [link for sublist in player_links for link in sublist]

    pd.DataFrame(player_merged_links, columns=['link']).to_csv(f"{leagues}_links.csv")



top_5:


tier_1:


tier_2:


tier_3:
FILE: tier_3; SKIPPED CLUB: https://www.transfermarkt.com/shanghai-port-b/startseite/verein/115122/saison_id/2023
FILE: tier_3; SKIPPED CLUB: https://www.transfermarkt.com/shandong-taishan-b/startseite/verein/115121/saison_id/2023


tier_4:


tier_5:


tier_6:
FILE: tier_6; SKIPPED CLUB: https://www.transfermarkt.com/nihon-university-fc/startseite/verein/114643/saison_id/2023
FILE: tier_6; SKIPPED CLUB: https://www.transfermarkt.com/yuri-club-akita-/startseite/verein/93642/saison_id/2023
FILE: tier_6; SKIPPED CLUB: https://www.transfermarkt.com/nakaniida-sc/startseite/verein/88151/saison_id/2023


youth_league:
FILE: youth_league; SKIPPED CLUB: https://www.transfermarkt.com/teikyo-nagaoka-high-school/startseite/verein/38432/saison_id/2023


domestic_cup:
FILE: domestic_cup; SKIPPED LEAGUE: https://www.transfermarkt.com/kings-cup/startseite/wettbewerb/SAKC
FILE: domestic_cup; SKIPPED LEAGUE: https://www.transfermarkt.com/uae-presidents-cup/startsei