In [1]:
import requests
from bs4 import BeautifulSoup
import jsonb
import re
import pandas as pd
from itables import show
import json


def get_parsed_html(url: str, timeout: float = 30.0):
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which can hang the cell.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (connection error, timeout, or non-2xx status). The failure
        reason is printed.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into an HTTPError handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def extract_players_data(soup):
    """Pull the ``playersData`` JSON payload out of a page's ``<script>`` tags.

    Looks for a script containing ``var playersData = JSON.parse(<literal>);``,
    un-escapes the quoted literal and parses it as JSON.

    Args:
        soup: Parsed document exposing ``find_all("script")``; each returned
            element must expose its source through ``.text``.

    Returns:
        The decoded JSON value, or ``None`` when no matching script is found
        or decoding fails (a message is printed in both cases).
    """
    try:
        for script in soup.find_all("script"):
            script_text = script.text
            if "var playersData" not in script_text:
                continue
            match = re.search(
                r"var playersData\s*=\s*JSON\.parse\((.*?)\);",
                script_text,
                re.DOTALL,
            )
            if not match:
                continue
            # Drop optional whitespace and the single quotes around the literal.
            raw_literal = match.group(1).strip().strip("'")
            # Resolve the backslash escapes. Encoding as latin-1 with
            # "backslashreplace" keeps literal non-ASCII characters intact;
            # encoding as UTF-8 first would turn "é" into "Ã©" because
            # unicode_escape reads each byte as one latin-1 character.
            decoded = raw_literal.encode("latin-1", "backslashreplace").decode(
                "unicode_escape"
            )
            return json.loads(decoded)
        print("playersData not found in scripts.")
        return None
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller decide.
        print(f"Error extracting playersData: {e}")
        return None


url = "https://understat.com/league/EPL/2024"
parsed_html = get_parsed_html(url)

# Always bind players_data so later cells see None (not a NameError) when the
# page could not be fetched; get_parsed_html has already printed the reason.
players_data = extract_players_data(parsed_html) if parsed_html else None

if players_data:
    print("Extracted playersData:")
    # Show the size and a short preview instead of dumping the whole payload.
    print(f"{len(players_data)} records")
    print(players_data[:3] if isinstance(players_data, list) else players_data)
elif parsed_html:
    print("No playersData found.")

ModuleNotFoundError: No module named 'jsonb'

In [None]:
import requests
from bs4 import BeautifulSoup
import jsonb
import re
import pandas as pd
from itables import show


def get_parsed_html(url: str, timeout: float = 30.0):
    """Fetch ``url`` and return it parsed by BeautifulSoup (``html.parser``).

    Args:
        url: Address of the page to fetch.
        timeout: Maximum number of seconds to wait for the server. A request
            without a timeout can block the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` document on success; ``None`` if the request
        raised a ``requests`` exception (including timeouts and non-2xx
        responses), in which case the error is printed.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Convert HTTP error statuses into an exception caught below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def extract_players_data(soup):
    """Extract and decode the ``playersData`` JSON embedded in a script tag.

    Searches every ``<script>`` element for
    ``var playersData = JSON.parse(<literal>);``, resolves the backslash
    escapes in the quoted literal and parses the result as JSON.

    Args:
        soup: Parsed document exposing ``find_all("script")``; each returned
            element must expose its source through ``.text``.

    Returns:
        The decoded JSON value, or ``None`` when no matching script is found
        or decoding fails (a message is printed in both cases).
    """
    # This cell's import block does not import json, so bring it in here.
    import json

    try:
        for script in soup.find_all("script"):
            script_text = script.text
            if "var playersData" not in script_text:
                continue
            match = re.search(
                r"var playersData\s*=\s*JSON\.parse\((.*?)\);",
                script_text,
                re.DOTALL,
            )
            if not match:
                continue
            # Drop optional whitespace and the single quotes around the literal.
            raw_literal = match.group(1).strip().strip("'")
            # latin-1 + backslashreplace round-trips literal non-ASCII text;
            # a UTF-8 encode here would be mis-read byte-by-byte by
            # unicode_escape and produce mojibake ("é" -> "Ã©").
            decoded = raw_literal.encode("latin-1", "backslashreplace").decode(
                "unicode_escape"
            )
            return json.loads(decoded)
        print("playersData not found in scripts.")
        return None
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller decide.
        print(f"Error extracting playersData: {e}")
        return None


url = "https://understat.com/league/EPL/2024"
parsed_html = get_parsed_html(url)

# Bind players_data on every path: the DataFrame construction further down
# reads it, and an unbound name would surface as a confusing NameError when
# the download failed (get_parsed_html has already printed the reason).
players_data = extract_players_data(parsed_html) if parsed_html else None

if players_data:
    print("Extracted playersData:")
    # Print the size and a short preview rather than the entire payload.
    print(f"{len(players_data)} records")
    print(players_data[:3] if isinstance(players_data, list) else players_data)
elif parsed_html:
    print("No playersData found.")

# League identifiers accepted by get_understat_url.
understat_leagues = ["EPL", "La_liga", "Bundesliga", "Serie_A", "Ligue_1", "RFPL"]


def get_understat_url(league_name, year="2024"):
    """
    Constructs the Understat league URL based on the league name.

    Args:
        league_name (str): The name of the league (e.g., "EPL", "La_liga").
            Must be one of ``understat_leagues``.
        year (str): The season year (default is "2024").

    Returns:
        str: The constructed URL for the given league and year.

    Raises:
        ValueError: If ``league_name`` is not in ``understat_leagues``.
    """
    base_url = "https://understat.com/league/"
    if league_name not in understat_leagues:
        # Raise instead of returning the message: a returned string would be
        # indistinguishable from a real URL and be requested as-is.
        raise ValueError(
            f"League name '{league_name}' is not valid. Available options: {', '.join(understat_leagues)}"
        )
    return f"{base_url}{league_name}/{year}"


# Columns that are converted to numeric dtypes after loading.
numerical_columns = [
    "games", "time", "goals", "xG", "assists", "xA", "shots",
    "key_passes", "yellow_cards", "red_cards",
    "npg", "npxG", "xGChain", "xGBuildup",
]

# Subset of numeric columns used as the clustering / similarity feature space.
features = ["goals", "shots", "xG", "xA", "npg", "npxG", "xGChain", "xGBuildup"]

# One row per record in players_data; the listed columns are parsed to numbers.
df = pd.DataFrame(players_data)
df[numerical_columns] = df[numerical_columns].apply(pd.to_numeric)

# Feature matrix taken from df before any derived columns are added.
X = df[features]

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

# Standardise the feature matrix so each column contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Partition the players into four clusters; the fixed seed makes runs repeatable.
kmeans = KMeans(n_clusters=4, random_state=42).fit(X_scaled)
df["Cluster"] = kmeans.labels_

# Per-cluster average of every feature, used below to name the clusters.
cluster_means = df[features].groupby(df["Cluster"]).mean()
print("Cluster Means:\n", cluster_means)


def assign_cluster_labels(cluster_means):
    """Give each cluster a descriptive label based on its feature averages.

    Labels are handed out in priority order: the cluster with the highest mean
    ``goals`` becomes "High Scorers", then the best remaining cluster on
    ``xA`` becomes "Playmakers", then the best remaining cluster on
    ``xGBuildup`` becomes "Defenders". Every other cluster is "All-Rounders".

    A cluster that already has a label is skipped for later metrics, so one
    cluster leading on several metrics cannot have its label overwritten.

    Args:
        cluster_means: DataFrame indexed by cluster id with at least the
            columns ``goals``, ``xA`` and ``xGBuildup``.

    Returns:
        dict mapping each cluster id in ``cluster_means.index`` to its label.
    """
    metric_labels = (
        ("goals", "High Scorers"),
        ("xA", "Playmakers"),
        ("xGBuildup", "Defenders"),
    )
    labels = {}
    for metric, label in metric_labels:
        # Only clusters that do not have a label yet compete for this one.
        unlabeled = ~cluster_means.index.isin(list(labels))
        candidates = cluster_means.loc[unlabeled, metric].dropna()
        if candidates.empty:
            continue
        labels[candidates.idxmax()] = label
    for cluster in cluster_means.index:
        labels.setdefault(cluster, "All-Rounders")
    return labels


# Translate numeric cluster ids into the descriptive names chosen above.
cluster_labels = assign_cluster_labels(cluster_means)
df["Cluster Label"] = df["Cluster"].map(cluster_labels)

# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df["PCA1"], df["PCA2"] = X_pca[:, 0], X_pca[:, 1]

# Scatter of the players in PCA space, coloured by cluster label.
fig = px.scatter(
    df,
    x="PCA1",
    y="PCA2",
    color="Cluster Label",
    title="Enhanced Interactive Player Clusters",
    hover_data=[
        "player_name", "team_title",
        "goals", "assists", "xG", "xA", "shots",
        "Cluster Label",
    ],
)

# Horizontal legend anchored above the top-right corner of the plot area.
fig.update_layout(
    legend_title="Cluster Description",
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    xaxis_title="PCA Component 1",
    yaxis_title="PCA Component 2",
)

fig.show()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist


def find_similar_players(player_name, df, X_scaled, top_n=5):
    """Return the players closest to ``player_name`` in feature space.

    Closeness is the Euclidean distance between rows of ``X_scaled``; smaller
    values mean more similar players.

    Args:
        player_name: Exact value to look up in ``df["player_name"]``. If it
            occurs more than once, the first occurrence is used.
        df: DataFrame with ``player_name`` and ``team_title`` columns whose
            rows are in the same order as the rows of ``X_scaled``.
        X_scaled: 2-D array-like of features, one row per row of ``df``.
        top_n: Number of neighbours to return (default 5).

    Returns:
        A new DataFrame with ``player_name``, ``team_title`` and
        ``Similarity Score`` (the distance) for the ``top_n`` nearest other
        rows, nearest first; or an explanatory string when the player is not
        present in ``df``.
    """
    is_target = (df["player_name"] == player_name).to_numpy()
    if not is_target.any():
        return f"Player {player_name} not found in the dataset."

    # Row *position*, not index label: X_scaled is addressed positionally, so
    # this stays correct when df has a non-default index.
    player_pos = int(is_target.argmax())

    distances = cdist([X_scaled[player_pos]], X_scaled, metric="euclidean")[0]

    # Drop the query row explicitly rather than assuming it sorts first; a
    # player with identical features (distance 0) could otherwise displace it.
    ranked = distances.argsort(kind="stable")
    similar_idx = ranked[ranked != player_pos][:top_n]

    # Copy so adding the score column does not write into a view of df.
    similar_players = df.iloc[similar_idx][["player_name", "team_title"]].copy()
    similar_players["Similarity Score"] = distances[similar_idx]
    return similar_players


target_player = "Erling Haaland"
# Single source of truth for the neighbour count, so the printed heading
# cannot disagree with the number of rows actually requested.
n_similar = 2
similar_players = find_similar_players(target_player, df, X_scaled, top_n=n_similar)
print(f"Top {n_similar} players similar to {target_player}:\n")
print(similar_players)

In [None]:
# Display df with itables' show(); at this point df also carries the
# "Cluster", "Cluster Label", "PCA1" and "PCA2" columns added in earlier cells.
show(df)