In [183]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

In [184]:
# Read the xG table from xg.csv, decoding the file as ISO-8859-1.
xg = pd.read_csv(
    "xg.csv",
    encoding="ISO-8859-1",
)

In [185]:
# Sanity check: (rows, columns) of the xG table.
xg.shape

(1810, 39)

In [186]:
# Read the passing table from passes.csv, decoding the file as ISO-8859-1.
passes = pd.read_csv(
    "passes.csv",
    encoding="ISO-8859-1",
)

In [187]:
# Sanity check: (rows, columns) of the passing table.
passes.shape

(1810, 41)

In [188]:
# List the xG table's column labels.
# NOTE(review): both 'xG/90' and 'xG/90.1' are present -- presumably a repeated
# header in the CSV that pandas de-duplicated on read; confirm against the file.
xg.columns

Index(['Rk', 'Player', 'xG/90', 'Season', 'Age', 'Nation', 'Team', 'Comp',
       'MP', 'Min', '90s', 'Starts', 'Subs', 'unSub', 'Gls', 'Ast', 'G+A',
       'G-PK', 'PK', 'PKatt', 'PKm', 'xG/90.1', 'npxG', 'xAG', 'xG+xAG', 'xA',
       'npxG+xAG', 'G-xG', 'np:G-xG', 'A-xAG', 'npxG/Sh', 'Sh', 'G/Sh',
       'G/SoT', 'SoT', 'SoT%', 'Dist', 'FK', 'Pos'],
      dtype='object')

In [189]:
# List the passing table's column labels; several of them (e.g. Gls, Ast, G+A,
# G-PK, PK, PKatt, PKm) also appear in the xG table.
passes.columns

Index(['Rk', 'Player', 'Cmp/90', 'Season', 'Age', 'Nation', 'Team', 'Comp',
       'MP', 'Min', '90s', 'Starts', 'Subs', 'unSub', 'Gls', 'Ast', 'G+A',
       'G-PK', 'PK', 'PKatt', 'PKm', 'TotalCmp/90', 'TotalAtt', 'TotalCmp%',
       'TotalKP', 'passesToFinalThird', 'PPA', 'CrsPA', 'PrgP', 'TotDist',
       'PrgDist', 'ShortCmp', 'ShortAtt', 'ShortCmp%', 'MedCmp', 'MedAtt',
       'MedCmp%', 'LongCmp', 'LongAtt', 'LongCmp%', 'Pos'],
      dtype='object')

In [190]:
# The scoring columns below also exist in xg, so remove them from the passing
# table before joining.
passes_clean = passes.drop(
    columns=["Gls", "Ast", "G+A", "G-PK", "PK", "PKatt", "PKm"]
)

# Join the two tables on the columns that identify a player-season row.
# Columns that exist in both frames but are not join keys (Rk, MP, Min, ...)
# are kept twice, with pandas' default _x / _y suffixes.
df = xg.merge(
    passes_clean,
    on=["Player", "Season", "Team", "Comp", "Age", "Nation", "Pos"],
)
df.columns

Index(['Rk_x', 'Player', 'xG/90', 'Season', 'Age', 'Nation', 'Team', 'Comp',
       'MP_x', 'Min_x', '90s_x', 'Starts_x', 'Subs_x', 'unSub_x', 'Gls', 'Ast',
       'G+A', 'G-PK', 'PK', 'PKatt', 'PKm', 'xG/90.1', 'npxG', 'xAG', 'xG+xAG',
       'xA', 'npxG+xAG', 'G-xG', 'np:G-xG', 'A-xAG', 'npxG/Sh', 'Sh', 'G/Sh',
       'G/SoT', 'SoT', 'SoT%', 'Dist', 'FK', 'Pos', 'Rk_y', 'Cmp/90', 'MP_y',
       'Min_y', '90s_y', 'Starts_y', 'Subs_y', 'unSub_y', 'TotalCmp/90',
       'TotalAtt', 'TotalCmp%', 'TotalKP', 'passesToFinalThird', 'PPA',
       'CrsPA', 'PrgP', 'TotDist', 'PrgDist', 'ShortCmp', 'ShortAtt',
       'ShortCmp%', 'MedCmp', 'MedAtt', 'MedCmp%', 'LongCmp', 'LongAtt',
       'LongCmp%'],
      dtype='object')

In [191]:
# Columns that make up each player's feature vector, grouped by theme.
features = [
    # output
    "Gls", "Ast", "G+A",
    # xG stuff
    "xG/90", "npxG", "xAG", "xG+xAG", "npxG+xAG", "Sh", "SoT", "Dist",
    # passing
    "Cmp/90", "TotalCmp/90", "TotalCmp%", "TotalKP",
    "passesToFinalThird", "PPA", "CrsPA", "PrgP",
]

# Replace every missing value in the merged frame with 0, then keep only the
# feature columns as the model input.
df = df.fillna(value=0)
X = df.loc[:, features]

In [192]:
# Fit the scaler on the feature matrix and transform it in one step.
# X_scaled is the array the nearest-neighbour model is fitted on and that
# find_similar() reads query vectors from.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [193]:
# Brute-force nearest-neighbour index over the scaled features, compared by
# cosine distance.
model = NearestNeighbors(
    algorithm="brute",
    metric="cosine",
)
model.fit(X_scaled)

In [195]:
def find_similar(player_name, n=5):
    """Return the ``n`` rows of ``df`` most similar to ``player_name``.

    Similarity is ``1 - cosine distance`` between rows of ``X_scaled``, as
    reported by the fitted ``model``.

    Parameters
    ----------
    player_name : str
        Exact value to look up in ``df["Player"]``. If several rows match,
        the first one is used as the query.
    n : int, default 5
        Maximum number of similar rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns Player, Team, Season, Pos, Gls, Ast, G+A and SimilarityScore,
        ordered from most to least similar. The query row itself is excluded.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``Player == player_name``.
    """
    # Work with row *positions*: X_scaled and the indices returned by
    # kneighbors are positional, so index labels must not be used here.
    matches = np.flatnonzero((df["Player"] == player_name).to_numpy())
    if matches.size == 0:
        raise IndexError(f"player {player_name!r} not found in df['Player']")
    idx = matches[0]
    player_vec = X_scaled[idx].reshape(1, -1)

    # Ask for one extra neighbour because the query row is itself part of the
    # fitted data; never ask for more rows than were fitted.
    n_neighbors = min(n + 1, len(X_scaled))
    distances, indices = model.kneighbors(player_vec, n_neighbors=n_neighbors)

    # Drop the query row by position rather than assuming it is ranked first:
    # rows at an identical distance may be returned in any order.
    keep = indices[0] != idx
    neighbor_pos = indices[0][keep][:n]
    neighbor_dist = distances[0][keep][:n]

    columns = ["Player", "Team", "Season", "Pos", "Gls", "Ast", "G+A"]
    # assign() builds a new frame, so no column is written onto a slice of df.
    return df.iloc[neighbor_pos][columns].assign(
        SimilarityScore=1 - neighbor_dist
    )

# Example: the 5 rows most similar to the first "Erling Haaland" row in df.
print(find_similar("Erling Haaland", n=5))

                Player        Team     Season    Pos  Gls  Ast  G+A  \
29          Moise Kean  Fiorentina  2024-2025     FW  0.6  0.1  0.7   
115      Abdallah Sima       Brest  2024-2025  FW,MF  0.5  0.1  0.7   
111     Borja Iglesias  Celta Vigo  2024-2025     FW  0.5  0.1  0.6   
18   Ermedin Demirovi?   Stuttgart  2024-2025     FW  0.7  0.0  0.8   
45   Randal Kolo Muani     2 Teams  2024-2025     FW  0.6  0.1  0.7   

     SimilarityScore  
29          0.991411  
115         0.984818  
111         0.982394  
18          0.982293  
45          0.980302  
