In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
from itables import show
from typing import Tuple, Optional, List


class TransferMarketScraper:
    """Scrape club and squad tables for a fixed set of leagues from Transfermarkt.

    ``run()`` downloads every league overview page listed in ``self.leagues``,
    collects one row per club, then visits each club page and collects one row
    per player.  The results are stored on the instance as ``all_clubs_df``
    and ``all_players_df`` (both empty DataFrames until ``run()`` succeeds).
    """

    # Seconds to wait for a response before giving up.  Without a timeout,
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Pause (seconds) inserted before every page request.
    REQUEST_DELAY = 1

    # Suffix -> multiplier table used by ``convert_market_value``; suffixes are
    # matched case-insensitively against the end of the value string.
    _VALUE_SUFFIXES = (
        ("bn", 1_000_000_000),
        ("m", 1_000_000),
        ("k", 1_000),
    )

    def __init__(self):
        self.base_url = "https://www.transfermarkt.co.uk"
        self.leagues = {
            "Premier League": {"url": "/premier-league/startseite/wettbewerb/GB1"},
            "La Liga": {"url": "/laliga/startseite/wettbewerb/ES1"},
            "Serie A": {"url": "/serie-a/startseite/wettbewerb/IT1"},
            "Ligue 1": {"url": "/ligue-1/startseite/wettbewerb/FR1"},
            "Bundesliga": {"url": "/bundesliga/startseite/wettbewerb/L1"},
        }
        self.session = requests.Session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/85.0.4183.121 Safari/537.36"
            }
        )
        # CSS class strings of the shirt-number cell; the suffix encodes the
        # player's position group and the cell's ``title`` holds its label.
        self.position_classes = [
            "zentriert rueckennummer bg_Torwart",  # Goalkeeper
            "zentriert rueckennummer bg_Abwehr",  # Defender
            "zentriert rueckennummer bg_Mittelfeld",  # Midfielder
            "zentriert rueckennummer bg_Sturm",  # Forward
        ]
        self.all_clubs_df = pd.DataFrame()
        self.all_players_df = pd.DataFrame()

    def get_parsed_html(self, url: str) -> "Optional[BeautifulSoup]":
        """Fetch ``url`` and return it parsed, or ``None`` if the request failed.

        Request errors (including HTTP error statuses and timeouts) are
        reported on stdout instead of being raised.
        """
        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching the page: {e}")
            return None
        return BeautifulSoup(response.text, "html.parser")

    def _get_table_rows(self, url: str) -> list:
        """Return the direct ``<tr>`` children of the page's ``table.items`` body.

        Returns an empty list (after printing the reason) when the page could
        not be fetched or does not contain the expected table structure.
        """
        soup = self.get_parsed_html(url)
        if soup is None:
            return []

        table = soup.find("table", class_="items")
        if table is None:
            print(f"No table found on the page: {url}")
            return []

        tbody = table.find("tbody")
        if tbody is None:
            print(f"No tbody found in the table: {url}")
            return []

        # recursive=False skips rows of tables nested inside individual cells.
        return tbody.find_all("tr", recursive=False)

    def get_league_club_data(self, url: str) -> pd.DataFrame:
        """Return one row per club found on the league overview page at ``url``.

        Columns: Club, Club URL, Squad, Average Age, Foreigners,
        Average Market Value, Total Market Value (all as scraped text).
        An empty DataFrame is returned when nothing could be extracted.
        """
        clubs = []
        for row in self._get_table_rows(url):
            cols = row.find_all("td")
            if len(cols) < 7:
                # Not a club row (too few cells to index below).
                continue

            club_cell = cols[1]
            club_link_tag = club_cell.find("a", href=True)
            clubs.append(
                {
                    "Club": club_cell.get_text(strip=True),
                    "Club URL": club_link_tag["href"] if club_link_tag else None,
                    "Squad": cols[2].get_text(strip=True),
                    "Average Age": cols[3].get_text(strip=True),
                    "Foreigners": cols[4].get_text(strip=True),
                    "Average Market Value": cols[5].get_text(strip=True),
                    "Total Market Value": cols[6].get_text(strip=True),
                }
            )
        return pd.DataFrame(clubs)

    def convert_market_value(self, value: str) -> Tuple[Optional[str], Optional[float]]:
        """Convert a market value string such as ``"€35.00m"`` to a number.

        The first character is taken as the currency symbol.  The remainder may
        contain thousands separators (``,``) and an optional ``k``, ``m`` or
        ``bn`` suffix in any letter case.

        Returns:
            ``(currency_symbol, value)``, or ``(None, None)`` when ``value`` is
            not a string, is blank, or has no parseable numeric part.
        """
        if not isinstance(value, str):
            return None, None
        value = value.strip()
        if not value:
            return None, None

        currency_type = value[0]
        number = value[1:].replace(",", "").strip().lower()
        multiplier = 1
        for suffix, factor in self._VALUE_SUFFIXES:
            if number.endswith(suffix):
                number = number[: -len(suffix)]
                multiplier = factor
                break

        try:
            return currency_type, float(number) * multiplier
        except ValueError:
            return None, None

    @staticmethod
    def _split_birth_cell(text: str) -> Tuple[Optional[str], Optional[str]]:
        """Split ``"Aug 17, 1993 (31)"`` into ``("1993-08-17", "31")``.

        The date is ``None`` when it is not in ``%b %d, %Y`` form; the age is
        ``None`` when the text has no parenthesised part.
        """
        date_str, paren, rest = text.partition("(")
        age = rest.rstrip(")") if paren else None
        try:
            date_iso = datetime.strptime(date_str.strip(), "%b %d, %Y").strftime(
                "%Y-%m-%d"
            )
        except ValueError:
            date_iso = None
        return date_iso, age

    def _parse_player_row(self, row) -> Optional[dict]:
        """Extract one player record from a squad-table row.

        Returns ``None`` for rows without a recognised position cell or
        without a linked player name.
        """
        player_type = None
        for position_class in self.position_classes:
            position_td = row.find("td", class_=position_class)
            if position_td is not None:
                player_type = position_td.get("title", "").strip()
                break
        if not player_type:
            return None

        name_cell = row.find("td", class_="hauptlink")
        if name_cell is None:
            return None
        player_link_tag = name_cell.find("a", href=True)
        if player_link_tag is None:
            return None
        player_name = player_link_tag.text.strip()
        player_link = player_link_tag["href"]
        if not player_name or not player_link:
            return None

        # The second centred cell is read as "date of birth (age)"; the first
        # is presumably the shirt-number cell matched above.
        date_iso, age = None, None
        centred_cells = row.find_all("td", class_="zentriert")
        if len(centred_cells) >= 2:
            date_iso, age = self._split_birth_cell(
                centred_cells[1].get_text(strip=True)
            )

        market_value_ref, currency_type, market_value = None, None, None
        market_value_cell = row.find("td", class_="rechts hauptlink")
        if market_value_cell is not None:
            value_link = market_value_cell.find("a", href=True)
            if value_link is not None:
                market_value_ref = value_link["href"]
            currency_type, market_value = self.convert_market_value(
                market_value_cell.get_text(strip=True)
            )

        return {
            "Type": player_type,
            "Name": player_name,
            "Link": player_link,
            "Date of Birth": date_iso,
            "Age": age,
            "Market Value Reference": market_value_ref,
            "Currency Type": currency_type,
            "Market Value": market_value,
        }

    def _get_club_players_information(self, url: str) -> pd.DataFrame:
        """Return one row per player listed in the squad table at ``url``.

        An empty DataFrame is returned when the page or table is unavailable.
        """
        players_data = []
        for row in self._get_table_rows(url):
            player_data = self._parse_player_row(row)
            if player_data is not None:
                players_data.append(player_data)
        return pd.DataFrame(players_data)

    def get_all_club_player_data(self, club_data: pd.DataFrame) -> pd.DataFrame:
        """Fetch the squad of every club in ``club_data`` and concatenate them.

        Args:
            club_data: Frame with ``Club`` and ``Club URL`` columns; an optional
                ``League`` column is copied onto each player row.

        Returns:
            All players with added ``Club`` and ``League`` columns, or an empty
            DataFrame if no squad could be scraped.
        """
        all_players_data = []
        for _, row in club_data.iterrows():
            club_name = row["Club"]
            club_url = row["Club URL"]
            # .get() tolerates frames that were built without a League column.
            league = row.get("League")
            # pd.isna also catches NaN, which is truthy and would otherwise
            # reach the string concatenation below.
            if pd.isna(club_url) or not club_url:
                print(f"No URL for club {club_name}")
                continue

            # Throttle only when a request is actually about to be made.
            time.sleep(self.REQUEST_DELAY)
            club_request_url = self.base_url + club_url
            print(
                f"Fetching data for club: {club_name} ({league}) - {club_request_url}"
            )
            players_df = self._get_club_players_information(url=club_request_url)
            if not players_df.empty:
                players_df["Club"] = club_name
                players_df["League"] = league
                all_players_data.append(players_df)

        if not all_players_data:
            return pd.DataFrame()
        return pd.concat(all_players_data, ignore_index=True)

    def run(self):
        """Scrape all configured leagues, then all their clubs' squads.

        Populates ``self.all_clubs_df`` and ``self.all_players_df`` and displays
        each with ``show``.  Stops early (leaving both frames as they were) when
        no club data could be collected.
        """
        all_clubs_data = []
        for league_name, data in self.leagues.items():
            time.sleep(self.REQUEST_DELAY)
            league_url = self.base_url + data["url"]
            print(f"Fetching data for league: {league_name} - {league_url}")
            club_data = self.get_league_club_data(league_url)
            if not club_data.empty:
                club_data["League"] = league_name
                all_clubs_data.append(club_data)

        if not all_clubs_data:
            print("No club data collected.")
            return
        self.all_clubs_df = pd.concat(all_clubs_data, ignore_index=True)
        show(self.all_clubs_df)

        self.all_players_df = self.get_all_club_player_data(self.all_clubs_df)
        if not self.all_players_df.empty:
            show(self.all_players_df)
        else:
            print("No player data collected.")


# Scrape every configured league and its clubs (performs live HTTP requests),
# then expose both result tables as notebook-level names for the cells below.
scraper = TransferMarketScraper()
scraper.run()
all_clubs_df, all_players_df = scraper.all_clubs_df, scraper.all_players_df

Fetching data for league: Premier League - https://www.transfermarkt.co.uk/premier-league/startseite/wettbewerb/GB1
Fetching data for league: La Liga - https://www.transfermarkt.co.uk/laliga/startseite/wettbewerb/ES1
Fetching data for league: Serie A - https://www.transfermarkt.co.uk/serie-a/startseite/wettbewerb/IT1
Fetching data for league: Ligue 1 - https://www.transfermarkt.co.uk/ligue-1/startseite/wettbewerb/FR1
Fetching data for league: Bundesliga - https://www.transfermarkt.co.uk/bundesliga/startseite/wettbewerb/L1


Club,Club URL,Squad,Average Age,Foreigners,Average Market Value,Total Market Value,League
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,


Fetching data for club: Manchester City (Premier League) - https://www.transfermarkt.co.uk/manchester-city/startseite/verein/281/saison_id/2024
35.00m versus 35000000.0
35.00m versus 35000000.0
9.00m versus 9000000.0
9.00m versus 9000000.0
80.00m versus 80000000.0
80.00m versus 80000000.0
45.00m versus 45000000.0
45.00m versus 45000000.0
40.00m versus 40000000.0
40.00m versus 40000000.0
38.00m versus 38000000.0
38.00m versus 38000000.0
75.00m versus 75000000.0
75.00m versus 75000000.0
4.00m versus 4000000.0
4.00m versus 4000000.0
40.00m versus 40000000.0
40.00m versus 40000000.0
13.00m versus 13000000.0
13.00m versus 13000000.0
130.00m versus 130000000.0
130.00m versus 130000000.0
40.00m versus 40000000.0
40.00m versus 40000000.0
30.00m versus 30000000.0
30.00m versus 30000000.0
12.00m versus 12000000.0
12.00m versus 12000000.0
70.00m versus 70000000.0
70.00m versus 70000000.0
45.00m versus 45000000.0
45.00m versus 45000000.0
12.00m versus 12000000.0
12.00m versus 12000000.0
65.00m ver

Type,Name,Link,Date of Birth,Age,Market Value Reference,Currency Type,Market Value,Club,League
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,,,


In [3]:
# Display only the scraped players whose "Club" column is "Manchester City".
show(all_players_df[all_players_df["Club"] == "Manchester City"])

Unnamed: 0,Type,Name,Link,Date of Birth,Age,Market Value Reference,Currency Type,Market Value,Club,League
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,,,,


In [4]:
# Inspect the column names of the player table.
all_players_df.columns

Index(['Type', 'Name', 'Link', 'Date of Birth', 'Age',
       'Market Value Reference', 'Currency Type', 'Market Value', 'Club',
       'League'],
      dtype='object')

In [10]:
# List the distinct position labels; these are the values accepted as
# `position_filter` when filtering on the "Type" column.
all_players_df["Type"].unique()

array(['Goalkeeper', 'Defender', 'Midfield', 'Attack'], dtype=object)

In [None]:
import plotly.graph_objects as go
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from scipy.optimize import curve_fit
import numpy as np
import pandas as pd


def run_selected_models(data, age_column, value_column, position_column=None, position_filter=None):
    """
    Fit three age -> market value regressions and plot them with Plotly.

    The models are a one-hidden-layer neural network, a degree-2 polynomial
    regression, and a three-parameter sigmoid (logistic) curve. Each model is
    fitted independently; if one fails, its error is printed and the remaining
    models are still drawn.

    Args:
        data (pd.DataFrame): Data containing the age and market value columns.
            The caller's frame is not modified.
        age_column (str): Column name for age. Non-numeric entries are dropped.
        value_column (str): Column name for market values. Non-numeric entries
            are dropped.
        position_column (str, optional): Column name for player position. Defaults to None.
        position_filter (str, optional): Position to keep. Only applied when
            `position_column` is also provided.

    Returns:
        None (displays the figure using Plotly, or prints a message when fewer
        than two usable rows remain).
    """
    is_filtered = bool(position_column and position_filter)
    if is_filtered:
        data = data[data[position_column] == position_filter]

    # Work on a private copy: assigning columns on a filtered slice can raise
    # SettingWithCopyWarning, and the caller's frame must stay untouched.
    data = data.copy()
    data[age_column] = pd.to_numeric(data[age_column], errors='coerce')
    data[value_column] = pd.to_numeric(data[value_column], errors='coerce')
    data = data.dropna(subset=[age_column, value_column]).sort_values(by=age_column)

    if len(data) < 2:
        print("Not enough data points after filtering.")
        return

    x = data[age_column].to_numpy(dtype=float).reshape(-1, 1)
    y = data[value_column].to_numpy(dtype=float)
    ages = x.ravel()

    fig = go.Figure()
    # The observations are identical for every model, so draw them once.
    fig.add_trace(go.Scatter(x=ages, y=y, mode='markers', name='Data Points'))

    try:
        # MLPRegressor is sensitive to feature/target scale; raw market values
        # (~1e7) keep the optimizer from converging. Standardize both axes for
        # fitting and map the predictions back to the original units.
        x_mean, x_scale = x.mean(), (x.std() or 1.0)
        y_mean, y_scale = y.mean(), (y.std() or 1.0)
        x_scaled = (x - x_mean) / x_scale
        nn_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=0)
        nn_model.fit(x_scaled, (y - y_mean) / y_scale)
        y_nn_pred = nn_model.predict(x_scaled) * y_scale + y_mean
        fig.add_trace(go.Scatter(x=ages, y=y_nn_pred, mode='lines', name='Neural Network Regression', line=dict(color='red')))
    except ValueError as e:
        print(f"Neural Network Error: {e}")

    try:
        poly = PolynomialFeatures(degree=2)
        x_poly = poly.fit_transform(x)
        model_poly = LinearRegression().fit(x_poly, y)
        y_poly_pred = model_poly.predict(x_poly)
        fig.add_trace(go.Scatter(x=ages, y=y_poly_pred, mode='lines', name='Polynomial Regression (Degree 2)', line=dict(color='green')))
    except ValueError as e:
        print(f"Polynomial Regression Error: {e}")

    def sigmoid(x, L, x0, k):
        # L: upper asymptote, x0: midpoint age, k: steepness.
        return L / (1 + np.exp(-k * (x - x0)))

    try:
        # Start from the largest observed value, the median age and a gentle slope.
        p0 = [y.max(), float(np.median(ages)), 0.1]
        popt, _ = curve_fit(sigmoid, ages, y, p0=p0, maxfev=10000)
        y_sigmoid_pred = sigmoid(ages, *popt)
        fig.add_trace(go.Scatter(x=ages, y=y_sigmoid_pred, mode='lines', name='Sigmoid (Logistic) Function', line=dict(color='blue')))
    except (ValueError, RuntimeError, TypeError) as e:
        print(f"Sigmoid Function Error: {e}")

    # Only advertise the filter in the title when one was actually applied.
    fig.update_layout(
        title=f"Regression Models on Age vs Market Value{' (Filtered by Type)' if is_filtered else ''}",
        xaxis_title=age_column,
        yaxis_title=value_column,
        legend_title="Models",
        template="plotly_white",
        width=1500,
        height=600,
    )

    # Show the figure
    fig.show()

# Fit and plot the three models for players whose "Type" is "Attack".
run_selected_models(
    all_players_df, 
    age_column='Age', 
    value_column='Market Value', 
    position_column='Type', 
    position_filter='Attack'
)


Stochastic Optimizer: Maximum iterations (1000) reached and the optimization hasn't converged yet.

