In [315]:
import json

import pandas as pd
import requests
from bs4 import BeautifulSoup

# league_id = ("PL1", "A1", "SLO1", "UKR1")
league_id = ("PL1",)

# clubs = []

In [331]:
class ScrapeFromTM:
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0"}

    def __init__(self, leagues: tuple):
        self.df = None
        self.leagues = leagues
        self.clubs = []
        self.data = {
            'name': [],
            'date of birth': [],
            'height': [],
            'foot': [],
            'joined': [],
            'contract until': [],
        }

    def get_team_urls(self):
        for league in self.leagues:
            url = "https://www.transfermarkt.co.uk/quickselect/teams/" + league
            print(url)
            response = requests.request("GET", url, headers=ScrapeFromTM.headers)
            # print(response.text)
            for l in json.loads(response.text):
                self.clubs.append("https://transfermarkt.co.uk" + l["link"])

    def get_player_stats(self):
        for club in self.clubs:
            url = club.replace("startseite", "kader") + "/plus/1"
            response = requests.get(url, headers=ScrapeFromTM.headers)
            soup = BeautifulSoup(response.content, 'html.parser')

            all_tr = soup.find_all('tr', {'class': ['odd', 'even']})

            for row in all_tr:
                all_td = row.find_all('td', recursive=False)

                self.data['name'].append(all_td[1].text.split('.')[0][:-1])
                self.data['date of birth'].append(all_td[2].text[:-5])
                self.data['height'].append(all_td[4].text)
                self.data['foot'].append(all_td[5].text)
                self.data['joined'].append(all_td[6].text)
                self.data['contract until'].append(all_td[8].text)

    def create_dataset(self):
        self.df = pd.DataFrame(self.data)

    def clean_data(self):
        self.df["position"] = self.df["name"].apply(lambda x: ScrapeFromTM.clean_name_col(x)[1])
        self.df["name"] = self.df["name"].apply(lambda x: ScrapeFromTM.clean_name_col(x)[0])
        self.df["height"] = self.df["height"].apply(lambda x: float(x[:3].replace(",", ".")) if x != "-" else None)

    def save_to_csv_file(self, filename):
        self.df.to_csv(f"{filename}")

    @staticmethod
    def clean_name_col(x):
        return x.replace('\n', '').strip().split("    ")[0], x.replace('\n', '').strip().split("    ")[-1]

In [332]:
new_scrape = ScrapeFromTM(league_id)
new_scrape.get_team_urls()

https://www.transfermarkt.co.uk/quickselect/teams/PL1


In [333]:
new_scrape.clubs

['https://transfermarkt.co.uk/lech-poznan/startseite/verein/238',
 'https://transfermarkt.co.uk/rakow-czestochowa/startseite/verein/9644',
 'https://transfermarkt.co.uk/pogon-szczecin/startseite/verein/324',
 'https://transfermarkt.co.uk/lechia-gdansk/startseite/verein/4000',
 'https://transfermarkt.co.uk/piast-gliwice/startseite/verein/6112',
 'https://transfermarkt.co.uk/wisla-plock/startseite/verein/2118',
 'https://transfermarkt.co.uk/radomiak-radom/startseite/verein/7154',
 'https://transfermarkt.co.uk/gornik-zabrze/startseite/verein/428',
 'https://transfermarkt.co.uk/cracovia/startseite/verein/5689',
 'https://transfermarkt.co.uk/legia-warszawa/startseite/verein/255',
 'https://transfermarkt.co.uk/warta-poznan/startseite/verein/7146',
 'https://transfermarkt.co.uk/jagiellonia-bialystok/startseite/verein/2300',
 'https://transfermarkt.co.uk/zaglebie-lubin/startseite/verein/168',
 'https://transfermarkt.co.uk/stal-mielec/startseite/verein/22431',
 'https://transfermarkt.co.uk/slas

In [334]:
new_scrape.get_player_stats()

In [335]:
new_scrape.create_dataset()
new_scrape.clean_data()

In [336]:
new_scrape.df

Unnamed: 0,name,date of birth,height,foot,joined,contract until,position
0,Filip Bednarek,"Sep 26, 1992",1.8,right,"Jul 1, 2020","Jun 30, 2025",Goalkeeper
1,Dominik Holec,"Jul 28, 1994",1.9,right,"Jan 3, 2023","Jun 30, 2023",Goalkeeper
2,Artur Rudko,"May 7, 1992",1.9,right,"Jul 2, 2022","Jun 30, 2023",Goalkeeper
3,Antonio Milic,"Mar 10, 1994",1.9,left,"Jan 10, 2021","Jun 30, 2026",Centre-Back
4,Filip Dagerstal,"Feb 1, 1997",1.8,right,"Jul 26, 2022","Jun 30, 2023",Centre-Back
...,...,...,...,...,...,...,...
543,Jacek Podgorski,"Jun 23, 1996",1.8,right,"Aug 11, 2020","Jun 30, 2024",Right Winger
544,Jacek Kielb,"Jan 10, 1988",1.8,right,"Feb 6, 2020","Jun 30, 2024",Right Winger
545,Evgeniy Shikavka,"Oct 15, 1992",1.8,right,"Jan 26, 2022","Jun 30, 2023",Centre-Forward
546,Kacper Kostorz,"Aug 21, 1999",1.9,both,"Feb 17, 2023","Jun 30, 2023",Centre-Forward


In [343]:
new_scrape.save_to_csv_file("alpha.csv")


In [341]:
new_scrape.df["name"][5].replace("\xa0", "")

'Lubomir Satka'

In [342]:
new_scrape.df["name"][5]

'Lubomir Satka\xa0'

In [None]:
# def get_players_of_certain_club(url):
#     payload = ""
#     headers = {
#         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0"}
#
#     response = requests.request("GET", url, data=payload, headers=headers)
#     soup = BeautifulSoup(response.text)
#
#     club_players = []
#     try:
#         for player in soup.find_all("td", class_="rechts hauptlink"):
#             if player.a is not None:
#                 club_players.append(str(player.a.get("href")).replace("marktwertverlauf", "profil"))
#     except:
#         print("something's is lacking")
#     return club_players

In [None]:
# for club_url in clubs:
#     print(get_players_of_certain_club(club_url))
#     with open(f"./players.txt", "a+") as f:
#         f.write(";".join(get_players_of_certain_club(club_url)))

In [None]:
# url = 'https://www.transfermarkt.co.uk/afonso-sousa/profil/spieler/375378'
#
# headers = {
#     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0"}
#
# response = requests.request("GET", url, headers=headers)
# soup = BeautifulSoup(response.text)

In [None]:
# soup.find_all("span", class_="info-table__content info-table__content--regular")[1].text

In [None]:
# soup.find_all("span", class_="info-table__content info-table__content--bold")[1].text

In [None]:
# player_bio = dict()
# for pd in range(1, soup.find_all("span", class_="info-table__content info-table__content--bold").__len__()):
#     player_bio[f'{soup.find_all("span", class_="info-table__content info-table__content--regular")[pd].text}'] = \
#         soup.find_all("span", class_="info-table__content info-table__content--bold")[pd].text

In [None]:
# player_bio