In [4]:
### This is a revised notebook that clusters attackers by distance (i.e., players closest to each other in terms of performance).
### Spatial distance is used to perform distance-based clustering, instead of the K-means and DBSCAN approaches used in feature_selection.ipynb.

### Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

### Data

In [3]:
# Data locations, relative to the notebook's working directory.
# Kept as constants so the inputs can be relocated by editing one line each.
STATS_DATA_DIR = "Stefano_test/data"
VALUATION_DIR = "Player_Valuation_CSVs"

# Per-category statistics tables (one CSV per category).
df_attacking = pd.read_csv(f"{STATS_DATA_DIR}/attacking_data.csv")
df_attempts = pd.read_csv(f"{STATS_DATA_DIR}/attempts_data.csv")
df_defending = pd.read_csv(f"{STATS_DATA_DIR}/defending_data.csv")
df_disciplinary = pd.read_csv(f"{STATS_DATA_DIR}/disciplinary_data.csv")
df_distribution = pd.read_csv(f"{STATS_DATA_DIR}/distribution_data.csv")
df_goalkeeping = pd.read_csv(f"{STATS_DATA_DIR}/goalkeeping_data.csv")
df_goals = pd.read_csv(f"{STATS_DATA_DIR}/goals_data.csv")
df_key_stats = pd.read_csv(f"{STATS_DATA_DIR}/key_stats_data.csv")
df_players = pd.read_csv(f"{STATS_DATA_DIR}/players_data.csv")
df_teams = pd.read_csv(f"{STATS_DATA_DIR}/teams_data.csv")

# Valuation tables, one CSV per position group.
df_attackers_val = pd.read_csv(f"{VALUATION_DIR}/attacker_valuations.csv")
df_defenders_val = pd.read_csv(f"{VALUATION_DIR}/defender_valuations.csv")
df_goalkeepers_val = pd.read_csv(f"{VALUATION_DIR}/goalkeeper_valuations.csv")
df_midfielders_val = pd.read_csv(f"{VALUATION_DIR}/midfielder_valuations.csv")

# Combined table holding player statistics together with valuation columns;
# this is the frame the cells below work from.
df_all = pd.read_csv("all_players_with_valuations.csv")

### Merging data

In [4]:
# Preview the first rows of the combined players table to check its columns.
df_all.head()

Unnamed: 0,id_player,player_name,nationality,field_position,id_team,assists,corners_taken,offsides,dribbles,total_attempts,...,other,penalties_scored,distance_covered(km/h),top_speed,minutes_played,matches_appareance,Position,Age,Market Value,Club Name
0,250016833,harry kane,England,Forward,50037,1.0,0.0,3.0,10.0,20.0,...,0.0,3.0,43.71,30.35,360.0,4.0,Centre-Forward,31.0,€150.00m,Bayern Munich
1,250105927,viktor gyokeres,Sweden,Forward,50149,0.0,0.0,3.0,6.0,15.0,...,0.0,2.0,41.94,34.55,360.0,4.0,Centre-Forward,26.0,€75.00m,Sporting CP
2,250121533,vinicius junior,Brazil,Forward,50051,0.0,7.0,0.0,34.0,17.0,...,0.0,1.0,38.39,35.47,360.0,4.0,Left Winger,24.0,€200.00m,Real Madrid
3,250121294,tijjani reijnders,Netherlands,Midfielder,50058,0.0,0.0,1.0,7.0,8.0,...,0.0,0.0,46.61,32.26,360.0,4.0,Central Midfield,26.0,€50.00m,AC Milan
4,250160436,maghnes akliouche,France,Midfielder,50023,1.0,9.0,2.0,10.0,10.0,...,0.0,0.0,44.67,33.39,360.0,4.0,Right Winger,22.0,€40.00m,AS Monaco


In [6]:
# Keep only the rows whose field_position is 'Forward' (i.e. the attackers).
forward_mask = df_all['field_position'].eq('Forward')
attackers_df = df_all.loc[forward_mask]
attackers_df

Unnamed: 0,id_player,player_name,nationality,field_position,id_team,assists,corners_taken,offsides,dribbles,total_attempts,...,other,penalties_scored,distance_covered(km/h),top_speed,minutes_played,matches_appareance,Position,Age,Market Value,Club Name
0,250016833,harry kane,England,Forward,50037,1.0,0.0,3.0,10.0,20.0,...,0.0,3.0,43.71,30.35,360.0,4.0,Centre-Forward,31.0,€150.00m,Bayern Munich
1,250105927,viktor gyokeres,Sweden,Forward,50149,0.0,0.0,3.0,6.0,15.0,...,0.0,2.0,41.94,34.55,360.0,4.0,Centre-Forward,26.0,€75.00m,Sporting CP
2,250121533,vinicius junior,Brazil,Forward,50051,0.0,7.0,0.0,34.0,17.0,...,0.0,1.0,38.39,35.47,360.0,4.0,Left Winger,24.0,€200.00m,Real Madrid
18,250134304,gabriel martinelli,Brazil,Forward,52280,1.0,4.0,0.0,8.0,8.0,...,0.0,0.0,43.12,34.39,360.0,4.0,Left Winger,23.0,€85.00m,Arsenal FC
92,250130406,johan bakayoko,Belgium,Forward,50062,1.0,0.0,1.0,14.0,13.0,...,0.0,0.0,42.93,34.52,352.0,4.0,Right Winger,21.0,€45.00m,PSV Eindhoven
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,250138623,indrit tuci,Albania,Forward,50033,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.88,27.54,12.0,1.0,Centre-Forward,24.0,€1.00m,AC Sparta Prague
638,250180810,jarzinho malanga,Germany,Forward,50107,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.83,27.91,10.0,1.0,Left Winger,18.0,€1.40m,VfB Stuttgart
642,250137395,samuel mbangula,Belgium,Forward,50139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.94,29.33,9.0,1.0,Left Winger,21.0,€8.00m,Juventus FC
647,250172594,petar ratkov,Serbia,Forward,50030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.23,25.67,7.0,1.0,Centre-Forward,21.0,€6.00m,Red Bull Salzburg


In [7]:
# Columns kept for the attacker analysis: two identifier columns followed by
# the performance metrics.
attackers_features = [
    # identifiers
    'id_player', 'player_name',
    # output, pace and creativity
    'goals', 'top_speed', 'assists', 'dribbles',
    # shooting volume
    'attempts_on_target', 'total_attempts',
    # per-category breakdown columns
    'right_foot', 'left_foot', 'head', 'inside_area', 'outside_area',
]

# Independent copy restricted to the selected columns, so later edits do not
# touch attackers_df.
attackers_ft_df = attackers_df.loc[:, attackers_features].copy()
attackers_ft_df

# Disabled alternative, kept for reference: pulls columns from the individual
# source tables instead of attackers_df.
# NOTE(review): this looks superseded by the selection above — confirm before removing.
# goals = df_goals[['id_player','goals']].copy()
# top_speed = df_key_stats[['id_player','top_speed']].copy()
# assists_dribbles = df_attacking[['id_player','assists','dribbles']].copy()
# attempts = df_attempts[['id_player','attempts_on_target', 'total_attempts']].copy()
# minutes = df_key_stats[['id_player','minutes_played']].copy()
# name = df_players[['id_player', 'player_name']].copy()

Unnamed: 0,id_player,player_name,goals,top_speed,assists,dribbles,attempts_on_target,total_attempts,right_foot,left_foot,head,inside_area,outside_area
0,250016833,harry kane,5.0,30.35,1.0,10.0,12.0,20.0,5.0,0.0,0.0,5.0,0.0
1,250105927,viktor gyokeres,5.0,34.55,0.0,6.0,8.0,15.0,5.0,0.0,0.0,5.0,0.0
2,250121533,vinicius junior,4.0,35.47,0.0,34.0,10.0,17.0,2.0,2.0,0.0,3.0,1.0
18,250134304,gabriel martinelli,0.0,34.39,1.0,8.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0
92,250130406,johan bakayoko,1.0,34.52,1.0,14.0,5.0,13.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,250138623,indrit tuci,0.0,27.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,250180810,jarzinho malanga,0.0,27.91,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
642,250137395,samuel mbangula,0.0,29.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
647,250172594,petar ratkov,0.0,25.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
