In [45]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
from itables import show


def get_parsed_html(url: str, timeout: float = 10.0) -> "BeautifulSoup | None":
    """
    Fetches the content at the specified URL and returns a BeautifulSoup object.

    Args:
        url (str): The URL of the page to fetch.
        timeout (float): Seconds to wait for the server before giving up
            (default is 10.0). Without a timeout the request could block
            indefinitely on an unresponsive server.

    Returns:
        BeautifulSoup | None: Parsed HTML content of the page, or None when the
        request fails (connection error, timeout or HTTP error status). The
        failure is reported with a printed message rather than an exception.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 "
            "(Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 "
            "(KHTML, like Gecko) "
            "Chrome/85.0.4183.121 "
            "Safari/537.36"
        )
    }
    # Only the network call is guarded, so parser errors are not mislabelled
    # as fetch errors.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def extract_players_data(soup: "BeautifulSoup") -> "list | None":
    """
    Extracts players data from the parsed HTML content.

    The function looks for a ``<script>`` element containing
    ``var playersData = JSON.parse('...');``, undoes the backslash escapes of
    the quoted payload and parses the result as JSON.

    Args:
        soup (BeautifulSoup): Parsed HTML content of the page.

    Returns:
        list | None: The decoded ``playersData`` value (a list of players data
        dictionaries), or None when no matching script is found or the payload
        cannot be decoded. Failures are reported with a printed message.
    """
    pattern = re.compile(
        r"var playersData\s*=\s*JSON\.parse\('([^']+)'\);", re.MULTILINE | re.DOTALL
    )
    try:
        for script in soup.find_all("script"):
            # script.string is None for empty scripts or scripts with child nodes.
            if not script.string or "var playersData" not in script.string:
                continue
            match = pattern.search(script.string)
            if not match:
                continue
            # Resolve the \xNN escapes of the JavaScript string literal.
            # Encoding as latin-1 with "backslashreplace" keeps literal
            # non-ASCII characters intact; the utf-8 round trip would turn
            # them into mojibake because unicode_escape reads bytes as latin-1.
            json_text = (
                match.group(1)
                .encode("latin-1", "backslashreplace")
                .decode("unicode_escape")
            )
            return json.loads(json_text)
    except (AttributeError, ValueError) as e:
        # AttributeError: soup is not a parsed document (e.g. None).
        # ValueError: covers both bad escape sequences and invalid JSON.
        print(f"Error extracting playersData: {e}")
        return None
    print("playersData not found in scripts.")
    return None


def get_understat_url(league_name: str, year: str = "2024") -> str:
    """
    Builds the Understat league page URL for a league and season.

    Args:
        league_name (str): League identifier as used by Understat
            (e.g., "EPL", "La_liga").
        year (str): The season year (default is "2024").

    Returns:
        str: URL of the form ``https://understat.com/league/<league>/<year>``.

    Raises:
        ValueError: If ``league_name`` is not one of the supported leagues.
    """
    supported = ("EPL", "La_liga", "Bundesliga", "Serie_A", "Ligue_1", "RFPL")
    root = "https://understat.com/league/"
    if league_name in supported:
        return f"{root}{league_name}/{year}"
    raise ValueError(
        f"League name '{league_name}' is not valid. Available options: {', '.join(supported)}"
    )


def get_league_players_data(league_name: str, year: str = "2024") -> list:
    """
    Downloads the league page and returns the players data embedded in it.

    Args:
        league_name (str): The name of the league.
        year (str): The season year.

    Returns:
        list: List of players data dictionaries; an empty list when the page
        could not be fetched or contained no players data (a message is
        printed in either case).
    """
    soup = get_parsed_html(get_understat_url(league_name, year))
    if not soup:
        print(f"Failed to get HTML for {league_name} {year}.")
        return []

    records = extract_players_data(soup)
    if not records:
        print(f"No players data found for {league_name} {year}.")
        return []

    return records


def process_players_data(
    players_data: list, league_name: str, year: str
) -> pd.DataFrame:
    """
    Turns raw players data into a DataFrame with numeric statistics.

    Args:
        players_data (list): List of players data dictionaries.
        league_name (str): Name of the league, stored in a "league" column.
        year (str): Season year, stored in a "year" column.

    Returns:
        pd.DataFrame: One row per player. Known statistic columns that are
        present are converted to numbers; values that cannot be converted
        become NaN.
    """
    stat_fields = (
        "games", "time", "goals", "xG", "assists", "xA", "shots",
        "key_passes", "yellow_cards", "red_cards", "npg", "npxG",
        "xGChain", "xGBuildup",
    )
    frame = pd.DataFrame(players_data)
    # Only convert the statistics this payload actually contains.
    for field in (name for name in stat_fields if name in frame.columns):
        frame[field] = pd.to_numeric(frame[field], errors="coerce")
    frame["league"], frame["year"] = league_name, year
    return frame


# Leagues to scrape and the season to request for each of them.
understat_leagues = ["EPL", "La_liga", "Bundesliga", "Serie_A", "Ligue_1", "RFPL"]
year = "2024"

# One processed DataFrame per league; leagues that return no data are skipped.
all_players_data = []
for league in understat_leagues:
    print(f"Fetching data for {league} {year}")
    players_data = get_league_players_data(league, year)
    if players_data:
        df_league = process_players_data(players_data, league, year)
        all_players_data.append(df_league)

if all_players_data:
    df_all = pd.concat(all_players_data, ignore_index=True)
    print("Combined DataFrame shape:", df_all.shape)
    show(df_all)
    # Persist only when something was collected: df_all does not exist in the
    # "no data" branch, so an unconditional write would raise NameError.
    # NOTE(review): the index is written as well, so reading this file back
    # produces an extra unnamed first column unless it is used as the index.
    df_all.to_csv("understat_all_players_data")
else:
    print("No data collected.")

Fetching data for EPL 2024
Fetching data for La_liga 2024
Fetching data for Bundesliga 2024
Fetching data for Serie_A 2024
Fetching data for Ligue_1 2024
Fetching data for RFPL 2024
Combined DataFrame shape: (2680, 20)


id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,league,year
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,,,,,,,,,,,,,


In [16]:
# Render the combined players DataFrame as an interactive itables table.
show(df_all)

id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,league,year
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,,,,,,,,,,,,,


In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
import plotly.express as px


class PlayerClusterer:
    """
    Groups players with K-means on standardised statistics.

    After ``fit`` the instance holds a copy of the input frame enriched with
    the cluster id, a descriptive cluster label and a 2-D PCA projection, which
    ``plot_clustered_players`` and ``find_similar_players`` build on.
    """

    def __init__(self, n_clusters=4):
        """
        Creates the scaler, K-means and PCA estimators.

        Args:
            n_clusters (int): Number of K-means clusters (default is 4).
        """
        self.n_clusters = n_clusters
        self.scaler = StandardScaler()
        # Fixed random_state so repeated runs give the same clusters.
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.pca = PCA(n_components=2)
        # Populated by fit().
        self.features = None
        self.X_scaled = None
        self.df = None

    def fit(self, df, features):
        """
        Fits the clustering model to the data and prepares it for plotting and similarity search.

        Adds the columns "Cluster", "Cluster Label", "PCA1" and "PCA2" to an
        internal copy of ``df`` (the caller's frame is not modified) and prints
        the per-cluster feature means.

        Args:
            df (pd.DataFrame): DataFrame containing player data.
            features (list): List of feature names to use for clustering.
        """
        self.df = df.copy()
        # list() lets callers pass any iterable of column names.
        self.features = list(features)
        # NOTE(review): rows with missing feature values are not handled
        # here - confirm the inputs are NaN-free before fitting.
        X = self.df[self.features]
        self.X_scaled = self.scaler.fit_transform(X)

        self.df["Cluster"] = self.kmeans.fit_predict(self.X_scaled)

        # Means are taken on the unscaled values so they stay interpretable.
        cluster_means = self.df.groupby("Cluster")[self.features].mean()
        print("Cluster Means:\n", cluster_means)

        self.df["Cluster Label"] = self.df["Cluster"].map(
            self.assign_cluster_labels(cluster_means)
        )

        X_pca = self.pca.fit_transform(self.X_scaled)
        self.df["PCA1"] = X_pca[:, 0]
        self.df["PCA2"] = X_pca[:, 1]

    def assign_cluster_labels(self, cluster_means):
        """
        Assigns descriptive labels to each cluster based on feature importance.

        Labels are handed out in priority order ("goals", then "xA", then
        "xGBuildup"): each goes to the cluster with the highest mean for that
        feature among the clusters that are still unlabelled. A cluster that
        leads several features therefore keeps its first label instead of
        having it overwritten by a later one. Features missing from
        ``cluster_means`` are skipped.

        Args:
            cluster_means (pd.DataFrame): Mean values of features for each cluster.

        Returns:
            dict: Mapping from cluster number to cluster label. Clusters that
            receive none of the specific labels are "All-Rounders".
        """
        label_priority = (
            ("goals", "High Scorers"),
            ("xA", "Playmakers"),
            ("xGBuildup", "Defenders"),
        )
        labels = {}
        for feature, label in label_priority:
            if feature not in cluster_means.columns:
                continue
            # Only clusters without a label compete for this one.
            candidates = cluster_means[feature].drop(index=list(labels)).dropna()
            if candidates.empty:
                continue
            labels[candidates.idxmax()] = label
        for cluster in cluster_means.index:
            labels.setdefault(cluster, "All-Rounders")
        return labels

    def plot_clustered_players(self):
        """
        Creates and displays an interactive scatter plot of the clustered players.

        Players are placed by their two PCA components and coloured by cluster
        label; ``fit`` must have been called first.
        """
        fig = px.scatter(
            self.df,
            x="PCA1",
            y="PCA2",
            color="Cluster Label",
            title="Enhanced Interactive Player Clusters",
            hover_data=[
                "player_name",
                "team_title",
                "goals",
                "assists",
                "xG",
                "xA",
                "shots",
            ],
        )

        fig.update_layout(
            legend_title="Cluster Description",
            # Horizontal legend anchored above the top-right of the plot area.
            legend=dict(
                orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
            ),
            xaxis_title="PCA Component 1",
            yaxis_title="PCA Component 2",
        )

        fig.show()

    def find_similar_players(self, player_name, top_n=5):
        """
        Finds the most similar players to the given player based on selected features.

        Similarity is the Euclidean distance between standardised feature
        vectors, so a smaller "Similarity Score" means a closer match. When
        several rows share ``player_name`` the first one is used.

        Args:
            player_name (str): Name of the player.
            top_n (int): Number of similar players to return.

        Returns:
            pd.DataFrame: DataFrame containing similar players and their
            similarity scores, closest first. If the player is unknown, a
            message string is returned instead.
        """
        is_player = (self.df["player_name"] == player_name).to_numpy()
        if not is_player.any():
            return f"Player {player_name} not found in the dataset."

        # X_scaled is a plain array aligned with row order, so work with the
        # row position rather than the index label (they differ whenever the
        # fitted frame does not have a default 0..n-1 index).
        player_pos = int(is_player.argmax())

        distances = cdist(
            [self.X_scaled[player_pos]], self.X_scaled, metric="euclidean"
        )[0]

        order = distances.argsort()
        order = order[order != player_pos]  # never return the player itself
        order = order[: max(top_n, 0)]

        similar_players = self.df.iloc[order][["player_name", "team_title"]].copy()
        similar_players["Similarity Score"] = distances[order]
        return similar_players.reset_index(drop=True)


# Statistics used as clustering inputs (the same columns that
# process_players_data converts to numeric).
features = [
    "games",
    "time",
    "goals",
    "xG",
    "assists",
    "xA",
    "shots",
    "key_passes",
    "yellow_cards",
    "red_cards",
    "npg",
    "npxG",
    "xGChain",
    "xGBuildup",
]

clusterer = PlayerClusterer(n_clusters=4)

# fit() works on a copy of df_all and prints the per-cluster feature means.
clusterer.fit(df_all, features)
clusterer.plot_clustered_players()

# The returned "Similarity Score" is a Euclidean distance: lower means closer.
similar_players = clusterer.find_similar_players("Raphinha", top_n=5)
print(similar_players)

Cluster Means:
              games        time     goals        xG   assists        xA  \
Cluster                                                                  
0        10.741339  756.325635  1.524249  1.609451  1.362587  1.483879   
1         9.861598  681.431774  0.384990  0.513087  0.334308  0.408619   
2        11.195122  866.837398  5.121951  4.901010  1.861789  1.854786   
3         4.117486  164.657559  0.115665  0.166740  0.102004  0.117129   

             shots  key_passes  yellow_cards  red_cards       npg      npxG  \
Cluster                                                                       
0        13.625866   12.182448      1.457275   0.066975  1.381062  1.460848   
1         5.433723    4.263158      1.598441   0.072125  0.356725  0.487342   
2        28.731707   13.609756      1.178862   0.032520  4.601626  4.366640   
3         1.765938    1.186703      0.358834   0.027322  0.109290  0.159186   

          xGChain  xGBuildup  
Cluster                       
0 

       player_name           team_title  Similarity Score
0     Lamine Yamal            Barcelona          6.037056
1    Mohamed Salah            Liverpool          6.136701
2      Cole Palmer              Chelsea          6.465039
3  Ousmane Dembélé  Paris Saint Germain          6.760086
4    Omar Marmoush  Eintracht Frankfurt          7.204738


In [34]:
import pandas as pd

# Statistic columns that receive a "<column>_rank" companion column below.
numerical_columns = [
    "games",
    "time",
    "goals",
    "xG",
    "assists",
    "xA",
    "shots",
    "key_passes",
    "yellow_cards",
    "red_cards",
    "npg",
    "npxG",
    "xGChain",
    "xGBuildup",
]

# Grouping of statistics into the sections of the performance table.
# "games" and "time" are ranked but belong to no category.
categories = {
    "Shooting": ["goals", "xG", "npg", "npxG", "shots"],
    "Passing": ["assists", "xA", "key_passes"],
    "Possession": ["xGChain", "xGBuildup"],
    "Discipline": ["yellow_cards", "red_cards"],
}

# Adds the rank columns to df_all in place. Rank 1 is the highest value and
# tied players share the lowest rank of their group; for the Discipline
# columns rank 1 therefore means the most cards.
for col in numerical_columns:
    rank_col = col + "_rank"
    df_all[rank_col] = df_all[col].rank(ascending=False, method="min")


def get_player_performance_table(player_name, df, categories):
    """
    Builds a table of one player's values and ranks, grouped by category.

    Args:
        player_name (str): The name of the player.
        df (pd.DataFrame): Player data holding, for every statistic to report,
            both the value column and a matching "<stat>_rank" column.
        categories (dict): A dictionary mapping categories to statistical columns.

    Returns:
        pd.DataFrame: Columns "Category", "Stat", "Value" and "Rank", one row
        per statistic that has both a value and a rank. None (with a printed
        message) when the player is unknown or no statistic qualifies. When
        several rows share the name, the first one is used.
    """
    player_rows = df[df["player_name"] == player_name]
    if player_rows.empty:
        print(f"Player '{player_name}' not found in the dataset.")
        return None
    profile = player_rows.iloc[0]

    def _rows_for(category, stats):
        # Collect one record per statistic with a usable value and rank.
        rows = []
        for stat in stats:
            value = profile.get(stat, None)
            rank = profile.get(f"{stat}_rank", None)
            if pd.isnull(value) or pd.isnull(rank):
                continue
            rows.append(
                {
                    "Category": category,
                    "Stat": stat.replace("_", " ").title(),
                    "Value": value,
                    "Rank": int(rank),
                }
            )
        return rows

    # One frame per non-empty category, stacked in category order.
    per_category = [
        pd.DataFrame(rows)
        for rows in (_rows_for(name, stats) for name, stats in categories.items())
        if rows
    ]
    if not per_category:
        print(f"No data available for player '{player_name}'.")
        return None
    return pd.concat(per_category, ignore_index=True)


# Show the ranked performance table for two example players. The table is
# None when the player is missing or has no ranked statistics, so it is only
# displayed when one was produced.
player_name = "Erling Haaland"
performance_table = get_player_performance_table(player_name, df_all, categories)
if performance_table is not None:
    print(f"Performance Table for {player_name}:\n")
    show(performance_table)

player_name = "Bukayo Saka"
performance_table = get_player_performance_table(player_name, df_all, categories)
if performance_table is not None:
    print(f"\nPerformance Table for {player_name}:\n")
    show(performance_table)

Performance Table for Erling Haaland:



Category,Stat,Value,Rank
Loading ITables v2.2.3 from the internet... (need help?),,,



Performance Table for Bukayo Saka:



Category,Stat,Value,Rank
Loading ITables v2.2.3 from the internet... (need help?),,,


In [28]:
# Show only the rows of the combined frame whose league is "La_liga".
show(df_all[df_all["league"] == "La_liga"])

Unnamed: 0,id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,league,year
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,,,,,,,,,,,,,,


In [38]:
import pandas as pd
import plotly.graph_objects as go

# Statistic columns that receive a "<column>_pct_rank" companion column below.
numerical_columns = [
    "games",
    "time",
    "goals",
    "xG",
    "assists",
    "xA",
    "shots",
    "key_passes",
    "yellow_cards",
    "red_cards",
    "npg",
    "npxG",
    "xGChain",
    "xGBuildup",
]

# Statistics drawn on the radar chart, grouped by category.
radar_categories = {
    "Shooting": ["goals", "shots", "xG"],
    "Passing": ["assists", "key_passes", "xA"],
    "Possession": ["xGChain", "xGBuildup"],
}

# Adds the percentile columns to df_all in place. Values are on a 0-100
# scale and a higher statistic gives a higher percentile.
for col in numerical_columns:
    pct_rank_col = col + "_pct_rank"
    df_all[pct_rank_col] = df_all[col].rank(pct=True, ascending=True) * 100


def plot_player_radar(player_name, df, radar_categories):
    """
    Generates a radar chart for a player's performance across selected categories.

    Each axis shows the player's percentile rank (0-100) for one statistic,
    read from the "<stat>_pct_rank" column of ``df``. Statistics without a
    percentile value are left off the chart. When several rows share the name,
    the first one is used.

    Args:
        player_name (str): The name of the player.
        df (pd.DataFrame): The DataFrame containing player data.
        radar_categories (dict): Dictionary mapping categories to statistical columns.

    Returns:
        None: The chart is displayed as a side effect. Nothing is drawn (and a
        message is printed) when the player is unknown or has no percentile
        values for the requested statistics.
    """
    if player_name not in df["player_name"].values:
        print(f"Player '{player_name}' not found in the dataset.")
        return

    player_data = df[df["player_name"] == player_name].iloc[0]

    axis_labels = []
    percentiles = []
    for category, stats in radar_categories.items():
        for stat in stats:
            pct_value = player_data.get(stat + "_pct_rank", None)
            if pd.notnull(pct_value):
                axis_labels.append(f"{stat.replace('_', ' ').title()} ({category})")
                percentiles.append(pct_value)

    # Without this guard the polygon-closing step below would raise
    # IndexError on the empty lists.
    if not percentiles:
        print(f"No percentile data available for player '{player_name}'.")
        return

    # Repeat the first point so the outline forms a closed polygon.
    axis_labels.append(axis_labels[0])
    percentiles.append(percentiles[0])

    fig = go.Figure(
        data=go.Scatterpolar(
            r=percentiles, theta=axis_labels, fill="toself", name=player_name
        )
    )

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 100])),
        showlegend=False,
        title=f"Performance Radar Chart for {player_name}",
    )

    fig.show()


# Draw the percentile radar chart for one example player.
player_name = "Bukayo Saka"
plot_player_radar(player_name, df_all, radar_categories)

In [43]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Statistics that feed the Player Value Index (PVI).
metrics = [
    "goals",
    "xG",
    "assists",
    "xA",
    "shots",
    "key_passes",
    "xGChain",
    "xGBuildup",
]

# Work on a copy so the scaling below does not alter df_all.
df_valuation = df_all.copy()

# A missing statistic contributes nothing to the index.
df_valuation[metrics] = df_valuation[metrics].fillna(0)

# Bring every metric onto a common 0-1 scale before weighting.
scaler = MinMaxScaler()
df_valuation[metrics] = scaler.fit_transform(df_valuation[metrics])

# Relative importance of each metric; the weights sum to 1.0.
weights = {
    "goals": 0.25,
    "xG": 0.15,
    "assists": 0.20,
    "xA": 0.10,
    "shots": 0.10,
    "key_passes": 0.10,
    "xGChain": 0.05,
    "xGBuildup": 0.05,
}

# PVI is the weighted sum of the scaled metrics.
df_valuation["PVI"] = 0.0
for metric, weight in weights.items():
    df_valuation["PVI"] += df_valuation[metric] * weight

pvi_min = df_valuation["PVI"].min()
pvi_max = df_valuation["PVI"].max()
pvi_range = pvi_max - pvi_min
if pvi_range > 0:
    df_valuation["PVI_normalized"] = (df_valuation["PVI"] - pvi_min) / pvi_range
else:
    # Every player has the same PVI (e.g. a single-row frame); dividing by
    # the zero range would fill the column with NaN.
    df_valuation["PVI_normalized"] = 0.0

# Bounds of the value scale that the normalised PVI is mapped onto linearly.
min_value = 1e6
max_value = 1e8

df_valuation["Estimated_Value"] = (
    df_valuation["PVI_normalized"] * (max_value - min_value) + min_value
)

df_valuation = df_valuation.sort_values("Estimated_Value", ascending=False)

top_players = df_valuation[
    ["player_name", "team_title", "Estimated_Value", "PVI_normalized"]
].head(10)
print("Top 10 Players by Estimated Value:")
print(top_players)

# After the descending sort the first row is the highest-valued player.
top_player_value = df_valuation["Estimated_Value"].iloc[0]
print(f"\nTop player's estimated value: €{top_player_value:,.2f}")

Top 10 Players by Estimated Value:
             player_name           team_title  Estimated_Value  PVI_normalized
457             Raphinha            Barcelona     1.000000e+08        1.000000
945        Omar Marmoush  Eintracht Frankfurt     9.858293e+07        0.985686
944           Harry Kane        Bayern Munich     9.347132e+07        0.934054
454   Robert Lewandowski            Barcelona     9.184692e+07        0.917646
2277     Alexey Batrakov     Lokomotiv Moscow     8.983227e+07        0.897296
1          Mohamed Salah            Liverpool     8.666549e+07        0.865308
463         Lamine Yamal            Barcelona     8.584501e+07        0.857020
2293      Esequiel Barco       Spartak Moscow     8.320239e+07        0.830327
4            Cole Palmer              Chelsea     7.977548e+07        0.795712
1346       Mateo Retegui             Atalanta     7.731549e+07        0.770864

Top player's estimated value: €100,000,000.00
