In [18]:
import requests
from bs4 import BeautifulSoup
import jsonb
import re
import pandas as pd
from itables import show
import json

def get_parsed_html(url: str, timeout: float = 10.0):
    """
    Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url (str): Address of the page to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` can block forever on a stalled
            connection, which would hang the notebook cell.

    Returns:
        BeautifulSoup | None: The parsed document, or ``None`` when the
        request fails (connection error, timeout, or non-2xx status). The
        failure reason is printed rather than raised.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP 4xx/5xx answers into exceptions so they hit the handler below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')

def extract_players_data(soup):
    """
    Pull the ``playersData`` payload out of a parsed page.

    Scans every ``<script>`` tag for ``var playersData = JSON.parse(...)``,
    decodes the backslash-escaped string literal, and parses it as JSON.

    Args:
        soup: Parsed document exposing ``find_all('script')``; each returned
            element must expose its source through ``.text``.

    Returns:
        The decoded JSON value, or ``None`` when no matching script is found
        or decoding fails (the reason is printed, not raised).
    """
    try:
        for script in soup.find_all('script'):
            source = script.text
            if 'var playersData' not in source:
                continue
            match = re.search(r"var playersData\s*=\s*JSON\.parse\((.*?)\);", source, re.DOTALL)
            if not match:
                continue
            literal = match.group(1).strip()
            # Drop the JavaScript string delimiters (either quote style).
            if len(literal) >= 2 and literal[0] == literal[-1] and literal[0] in "'\"":
                literal = literal[1:-1]
            # Resolve \xNN / \uNNNN escapes. Encoding with latin-1 +
            # backslashreplace keeps raw non-ASCII characters intact; going
            # through UTF-8 would turn e.g. "é" into "Ã©".
            clean_data = literal.encode('latin-1', 'backslashreplace').decode('unicode_escape')
            return json.loads(clean_data)
        print("playersData not found in scripts.")
        return None
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller handle None.
        print(f"Error extracting playersData: {e}")
        return None

url = "https://understat.com/league/EPL/2024"
parsed_html = get_parsed_html(url)

# Bind players_data on every path: if the fetch failed it is None rather than
# undefined, so later cells can test it instead of dying with a NameError.
players_data = extract_players_data(parsed_html) if parsed_html else None

if players_data:
    print("Extracted playersData:")
    print(players_data)
elif parsed_html:
    # The page was fetched but held no usable payload; fetch failures have
    # already been reported by get_parsed_html.
    print("No playersData found.")

Extracted playersData:
[{'id': '8260', 'player_name': 'Erling Haaland', 'games': '11', 'time': '990', 'goals': '12', 'xG': '12.19233638048172', 'assists': '0', 'xA': '0.8076825514435768', 'shots': '56', 'key_passes': '5', 'yellow_cards': '2', 'red_cards': '0', 'position': 'F', 'team_title': 'Manchester City', 'npg': '11', 'npxG': '11.431167542934418', 'xGChain': '9.736066937446594', 'xGBuildup': '1.0157683677971363'}, {'id': '1250', 'player_name': 'Mohamed Salah', 'games': '11', 'time': '970', 'goals': '8', 'xG': '6.820769935846329', 'assists': '6', 'xA': '4.119314484298229', 'shots': '33', 'key_passes': '21', 'yellow_cards': '0', 'red_cards': '0', 'position': 'M', 'team_title': 'Liverpool', 'npg': '6', 'npxG': '5.298432279378176', 'xGChain': '13.500556617975235', 'xGBuildup': '4.79445331171155'}, {'id': '4456', 'player_name': 'Chris Wood', 'games': '11', 'time': '925', 'goals': '8', 'xG': '5.410878539085388', 'assists': '0', 'xA': '0.9723502080887556', 'shots': '21', 'key_passes': '6'

In [None]:
import requests
from bs4 import BeautifulSoup
import jsonb
import re
import pandas as pd
from itables import show

def get_parsed_html(url: str, timeout: float = 10.0):
    """
    Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url (str): Address of the page to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` can block forever on a stalled
            connection, which would hang the notebook cell.

    Returns:
        BeautifulSoup | None: The parsed document, or ``None`` when the
        request fails (connection error, timeout, or non-2xx status). The
        failure reason is printed rather than raised.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP 4xx/5xx answers into exceptions so they hit the handler below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')

def extract_players_data(soup):
    """
    Pull the ``playersData`` payload out of a parsed page.

    Scans every ``<script>`` tag for ``var playersData = JSON.parse(...)``,
    decodes the backslash-escaped string literal, and parses it as JSON.

    Args:
        soup: Parsed document exposing ``find_all('script')``; each returned
            element must expose its source through ``.text``.

    Returns:
        The decoded JSON value, or ``None`` when no matching script is found
        or decoding fails (the reason is printed, not raised).
    """
    try:
        for script in soup.find_all('script'):
            source = script.text
            if 'var playersData' not in source:
                continue
            match = re.search(r"var playersData\s*=\s*JSON\.parse\((.*?)\);", source, re.DOTALL)
            if not match:
                continue
            literal = match.group(1).strip()
            # Drop the JavaScript string delimiters (either quote style).
            if len(literal) >= 2 and literal[0] == literal[-1] and literal[0] in "'\"":
                literal = literal[1:-1]
            # Resolve \xNN / \uNNNN escapes. Encoding with latin-1 +
            # backslashreplace keeps raw non-ASCII characters intact; going
            # through UTF-8 would turn e.g. "é" into "Ã©".
            clean_data = literal.encode('latin-1', 'backslashreplace').decode('unicode_escape')
            return json.loads(clean_data)
        print("playersData not found in scripts.")
        return None
    except Exception as e:
        # Best-effort scraper: report the problem and let the caller handle None.
        print(f"Error extracting playersData: {e}")
        return None

url = "https://understat.com/league/EPL/2024"
parsed_html = get_parsed_html(url)

# Bind players_data on every path: the DataFrame construction further down
# uses it unconditionally, and would otherwise hit a NameError when the
# fetch fails.
players_data = extract_players_data(parsed_html) if parsed_html else None

if players_data:
    print("Extracted playersData:")
    print(players_data)
elif parsed_html:
    # The page was fetched but held no usable payload; fetch failures have
    # already been reported by get_parsed_html.
    print("No playersData found.")

# League codes accepted by get_understat_url.
understat_leagues = ["EPL", "La_liga", "Bundesliga", "Serie_A", "Ligue_1", "RFPL"]

def get_understat_url(league_name, year="2024"):
    """
    Build the Understat league page URL for a league and season.

    Args:
        league_name (str): League code; must be one of ``understat_leagues``.
        year (str): Season segment appended to the URL (default "2024").

    Returns:
        str: ``https://understat.com/league/<league_name>/<year>`` when the
        league is known. For an unknown league a human-readable message
        listing the valid codes is returned instead of a URL (nothing is
        raised), so callers should validate the league before fetching.
    """
    if league_name in understat_leagues:
        league_root = "https://understat.com/league/"
        return f"{league_root}{league_name}/{year}"

    valid_codes = ', '.join(understat_leagues)
    return f"League name '{league_name}' is not valid. Available options: {valid_codes}"

# One row per scraped player record.
df = pd.DataFrame(players_data)

# The scraped stats arrive as strings, so convert each stat column to a
# numeric dtype before any modelling.
numerical_columns = [
    'games', 'time', 'goals', 'xG', 'assists', 'xA', 'shots',
    'key_passes', 'yellow_cards', 'red_cards', 'npg', 'npxG',
    'xGChain', 'xGBuildup',
]
for stat_col in numerical_columns:
    df[stat_col] = pd.to_numeric(df[stat_col])

# Feature matrix used by the clustering and similarity cells.
features = ['goals', 'shots', 'xG', 'xA', 'npg', 'npxG', 'xGChain', 'xGBuildup']
X = df[features]

In [19]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

# Put every feature on a comparable scale before distance-based clustering.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Four clusters with a fixed seed so the assignment is repeatable; the fitted
# model's labels are stored as each player's cluster id.
kmeans = KMeans(n_clusters=4, random_state=42).fit(X_scaled)
df['Cluster'] = kmeans.labels_

# Per-cluster averages of the raw (unscaled) features, used for labelling.
cluster_means = df.groupby('Cluster')[features].mean()
print("Cluster Means:\n", cluster_means)

def assign_cluster_labels(cluster_means):
    """
    Give each cluster a descriptive label based on its feature averages.

    Labels are handed out in priority order: 'High Scorers' (highest
    ``goals``), 'Playmakers' (highest ``xA``), 'Defenders' (highest
    ``xGBuildup``). Each label goes to the best-ranked cluster that has not
    been labelled yet, so one cluster leading several features no longer
    overwrites (and thereby drops) an earlier label. Remaining clusters
    become 'All-Rounders'.

    Args:
        cluster_means (pd.DataFrame): One row per cluster id, with at least
            the columns ``goals``, ``xA`` and ``xGBuildup``.

    Returns:
        dict: Mapping of cluster id -> label, covering every row of
        ``cluster_means``.
    """
    label_priority = [
        ('goals', 'High Scorers'),
        ('xA', 'Playmakers'),
        ('xGBuildup', 'Defenders'),
    ]

    labels = {}
    for feature, label in label_priority:
        # Clusters ordered from highest to lowest average for this feature.
        ranked = cluster_means[feature].sort_values(ascending=False, kind='stable').index
        winner = next((cluster for cluster in ranked if cluster not in labels), None)
        if winner is not None:  # None only when there are fewer clusters than labels
            labels[winner] = label

    for cluster in cluster_means.index:
        labels.setdefault(cluster, 'All-Rounders')
    return labels

# Translate numeric cluster ids into readable labels.
cluster_labels = assign_cluster_labels(cluster_means)
df['Cluster Label'] = df['Cluster'].map(cluster_labels)

# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df['PCA1'], df['PCA2'] = X_pca[:, 0], X_pca[:, 1]

# Columns shown in the tooltip of each point.
hover_columns = [
    'player_name', 'team_title', 'goals', 'assists',
    'xG', 'xA', 'shots', 'Cluster Label',
]
fig = px.scatter(
    df,
    x='PCA1',
    y='PCA2',
    color='Cluster Label',
    title="Enhanced Interactive Player Clusters",
    hover_data=hover_columns,
)

# Horizontal legend anchored just above the top-right corner of the plot.
legend_layout = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": 1.02,
    "xanchor": "right",
    "x": 1,
}
fig.update_layout(
    legend_title="Cluster Description",
    legend=legend_layout,
    xaxis_title="PCA Component 1",
    yaxis_title="PCA Component 2",
)

fig.show()


Cluster Means:
             goals      shots        xG        xA       npg      npxG  \
Cluster                                                                
0        0.112403   2.213178  0.209627  0.174752  0.112403  0.209627   
1        2.092593  16.703704  2.415188  1.591065  1.962963  2.274231   
2        5.454545  28.590909  5.232070  1.954902  5.090909  4.920683   
3        0.366667   7.091667  0.616203  0.915004  0.366667  0.616203   

          xGChain  xGBuildup  
Cluster                       
0        0.900113   0.645100  
1        5.030772   2.172732  
2        7.471729   1.919010  
3        3.939192   3.115098  


In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

def find_similar_players(player_name, df, X_scaled, top_n=5):
    """
    Find the players whose feature vectors lie closest to a given player.

    Args:
        player_name (str): Name to look up in ``df['player_name']``; the
            first matching row is used.
        df (pd.DataFrame): Player table with ``player_name`` and
            ``team_title`` columns, row-aligned with ``X_scaled``.
        X_scaled: 2-D array-like of feature vectors, one row per row of ``df``.
        top_n (int): Number of neighbours to return.

    Returns:
        pd.DataFrame | str: Up to ``top_n`` rows with ``player_name``,
        ``team_title`` and ``Similarity Score`` (the Euclidean distance, so
        smaller means more similar), nearest first. If the player is not in
        ``df`` a message string is returned instead.
    """
    is_target = (df['player_name'] == player_name).to_numpy()
    if not is_target.any():
        return f"Player {player_name} not found in the dataset."

    # Row *position*, not index label: X_scaled is a plain array, so a label
    # from a filtered or re-indexed frame would point at the wrong row.
    player_pos = int(is_target.argmax())

    distances = cdist([X_scaled[player_pos]], X_scaled, metric='euclidean')[0]

    # Exclude the target explicitly instead of assuming it sorts first; another
    # player with identical features also has distance 0.
    ranked = distances.argsort(kind='stable')
    similar_idx = ranked[ranked != player_pos][:top_n]

    # Copy so the new column is added to an independent frame, not a view of df.
    similar_players = df.iloc[similar_idx][['player_name', 'team_title']].copy()
    similar_players['Similarity Score'] = distances[similar_idx]
    return similar_players

target_player = "Erling Haaland"
# Single source of truth for the neighbour count, so the printed heading
# always matches the number of players actually requested.
n_similar = 2
similar_players = find_similar_players(target_player, df, X_scaled, top_n=n_similar)
print(f"Top {n_similar} players similar to {target_player}:\n")
print(similar_players)

Top 5 players similar to Nicolas Jackson:

       player_name        team_title  Similarity Score
10   Ollie Watkins       Aston Villa          0.012453
0   Erling Haaland   Manchester City          0.014927
16  Alexander Isak  Newcastle United          0.019684
15     Kai Havertz           Arsenal          0.020653
5    Danny Welbeck          Brighton          0.022462


In [22]:
# Display the player DataFrame using `show` from itables (imported in the first cell).
show(df)

id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup,Cluster,Cluster Label,PCA1,PCA2
Loading ITables v2.2.3 from the internet... (need help?),,,,,,,,,,,,,,,,,,,,,
