In [111]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "iframe"

import numpy as np

import math

from sklearn.cluster import KMeans

In [112]:
# Read the raw player table from a relative path and list its columns.
players_csv_path = "../data/players_data.csv"
df = pd.read_csv(players_csv_path)
df.columns

Index(['NAME', 'BIRTH_DATE', 'COUNTRY', 'TEAM', 'ID', 'OVERALL_RATING',
       'POTENTIAL', 'VALUE', 'WAGE', 'HEIGHT', 'KIT_NUMBER', 'PREFERRED_FOOT',
       'WEAK_FOOT', 'SKILL_MOVES', 'INTERNATIONAL_REPUTATION', 'REAL_FACE',
       'RELEASE_CLAUSE', 'CROSSING', 'FINISHING', 'HEADING_ACCURACY',
       'SHORT_PASSING', 'VOLLEYS', 'DRIBBLING', 'CURVE', 'FK_ACCURACY',
       'LONG_PASSING', 'BALL_CONTROL', 'ACCELERATION', 'SPRINT_SPEED',
       'AGILITY', 'REACTIONS', 'BALANCE', 'SHOT_POWER', 'JUMPING', 'STAMINA',
       'STRENGTH', 'LONG_SHOTS', 'AGGRESSION', 'INTERCEPTIONS', 'POSITIONING',
       'VISION', 'PENALTIES', 'COMPOSURE', 'DEFENSIVE_AWARENESS',
       'STANDING_TACKLE', 'SLIDING_TACKLE', 'GK_DIVING', 'GK_HANDLING',
       'GK_KICKING', 'GK_POSITIONING', 'GK_REFLEXES', 'POSITION'],
      dtype='object')

In [113]:
def clean_and_convert(column):
    """Convert money-like strings such as ``'€170K'`` or ``'€21.5M'`` to floats.

    The first number found in each string (integer or decimal) is parsed and
    scaled by the magnitude suffix that directly follows it: ``K``/``k`` means
    thousands and ``M``/``m`` means millions. A number without a suffix is
    returned unscaled.

    Parameters
    ----------
    column : pandas.Series
        Series of strings (missing values are allowed).

    Returns
    -------
    pandas.Series
        ``float64`` Series with the same index and name as ``column``.
        Entries that contain no digits become ``NaN``.
    """
    # Group 0: the number; group 1: an optional K/M suffix right after it.
    parts = column.str.extract(r'(\d*\.\d+|\d+)\s*([KkMm])?')
    amount = pd.to_numeric(parts[0], errors='coerce')
    # A missing suffix maps to NaN and therefore falls back to a factor of 1.
    factor = (
        parts[1].str.upper()
        .map({'K': 1e3, 'M': 1e6})
        .fillna(1.0)
        .astype(float)
    )
    column_numeric = amount * factor
    column_numeric.name = column.name
    return column_numeric

# Parse the WAGE strings into numeric amounts.
df['WAGE'] = clean_and_convert(df['WAGE'])

# Keep only players with a usable, non-zero wage. Unparseable wages (NaN) are
# excluded explicitly: NaN passes a plain `!= 0` comparison, and KMeans does
# not accept NaN inputs. `.copy()` makes the result an independent frame so
# that later column assignments do not raise SettingWithCopyWarning.
df = df[df['WAGE'].notna() & (df['WAGE'] != 0)].copy()

In [114]:
# Feature matrix for clustering, selected by column label instead of position
# so it stays correct if columns are added or reordered.
# Column 0 is OVERALL_RATING and column 1 is WAGE.
wage_and_overall = df[['OVERALL_RATING', 'WAGE']].to_numpy()

In [115]:
# Fit the scaler on the feature matrix, then replace the matrix with its
# standardised version.
standard_scaler = StandardScaler().fit(wage_and_overall)
wage_and_overall = standard_scaler.transform(wage_and_overall)

In [116]:
# Elbow method: fit KMeans for k = 1..10 and record each model's inertia
# (within-cluster sum of squares).
wcss = []
for i in range(1, 11):
    # n_init is set explicitly because scikit-learn 1.4 changed its default
    # from 10 to 'auto'; pinning it keeps results independent of the installed
    # version and avoids the related FutureWarning on 1.2/1.3.
    kmeans_players = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans_players.fit(wage_and_overall)
    wcss.append(kmeans_players.inertia_)

wcss























[12006.000000000013,
 6368.440798721481,
 3722.513924068055,
 2483.0689736144777,
 1819.6700369959594,
 1427.5217282899553,
 1204.2770949482003,
 1039.148108845472,
 908.779245406969,
 789.5168042307035]

In [117]:
# Plot inertia against the number of clusters to look for the "elbow".
cluster_counts = range(1, 11)
cluster_best_number_graph = px.line(x=cluster_counts, y=wcss)
cluster_best_number_graph.show()

In [118]:
# Final clustering with 6 clusters; `labels` holds one cluster id per row of
# the feature matrix.
# n_init is set explicitly because scikit-learn 1.4 changed its default from
# 10 to 'auto'; pinning it keeps the clustering independent of the installed
# version.
kmeans_players = KMeans(n_clusters=6, n_init=10, random_state=0)
labels = kmeans_players.fit_predict(wage_and_overall)





In [119]:
# Scatter of the two scaled feature columns, coloured by cluster label.
first_feature = wage_and_overall[:, 0]
second_feature = wage_and_overall[:, 1]
groups_graph = px.scatter(x=first_feature, y=second_feature, color=labels)
groups_graph.show()

In [120]:
# Append the cluster label to every player row as an extra trailing column.
label_column = np.asarray(labels).reshape(-1, 1)
players_list = np.concatenate((np.asarray(df), label_column), axis=1)
players_list

array([['Manor Solomon', '24/07/1999', 'Israel', ..., 10.0, 'LM', 0],
       ['Marcus Rashford', '31/10/1997', 'England', ..., 14.0, 'LW', 3],
       ['Hugo Bueno López', '18/09/2002', 'Spain', ..., 11.0, 'LB', 0],
       ...,
       ['Kevin Restenedo', '29/02/1988', 'Ecuador', ..., 12.0, 'ST', 2],
       ['Fabian Pekruhl', '14/02/1999', 'Germany', ..., 57.0, 'GK', 5],
       ['Rayvien Rosario', '11/04/2004', 'Curacao', ..., 6.0, 'CAM', 5]],
      dtype=object)

In [121]:
# Shape check: (rows, columns) of the stacked player/label array.
players_list.shape

(6003, 53)

In [122]:
# Order the rows by cluster label. The label is addressed as the last column
# (-1) rather than by a fixed position, so the sort key stays correct even if
# the number of data columns in front of it changes.
cluster_order = players_list[:, -1].argsort()
players_list = players_list[cluster_order]
# Keep NAME (0), OVERALL_RATING (5), WAGE (8) and the cluster label (last).
players_list = players_list[:, [0, 5, 8, -1]]
players_list

array([['Manor Solomon', 77, 1000.0, 0],
       ['Robin Kwamina Quaison', 70, 18000.0, 0],
       ["Youssouf Yacoub M'Changama", 71, 12000.0, 0],
       ...,
       ['Darren Yapi', 56, 500.0, 5],
       ['Lex-Tyger Lobinger', 62, 2000.0, 5],
       ['Rayvien Rosario', 57, 650.0, 5]], dtype=object)

In [123]:
# Split the trimmed array into plotting inputs; the numeric columns are stored
# as objects, so they are converted to float first.
names = players_list[:, 0]
overall_ratings = np.asarray(players_list[:, 1], dtype=float)
wages = np.asarray(players_list[:, 2], dtype=float)

# Rating-vs-wage scatter with every point annotated by the player's name.
axis_titles = {'x': 'Overall Rating', 'y': 'Wage'}
fig = px.scatter(
    x=overall_ratings,
    y=wages,
    text=names,
    labels=axis_titles,
    title='Scatter Plot of Overall Rating vs Wage',
    template='plotly_white',
)

fig.update_traces(textposition='bottom right', textfont={'size': 8})
fig.update_layout(width=1000, height=625)

fig.show()

In [124]:
# Bucket OVERALL_RATING into half-open, 5-point-wide ranges [start, end).
bin_width = 5
lowest_edge = math.floor(df['OVERALL_RATING'].min() / bin_width) * bin_width
# Bins are right-open, so the top edge must lie strictly above the maximum
# rating. Flooring and adding one full bin guarantees this even when the
# maximum is an exact multiple of the bin width (a top edge equal to the
# maximum would leave that rating outside every bin, i.e. NaN).
highest_edge = (math.floor(df['OVERALL_RATING'].max() / bin_width) + 1) * bin_width
# np.arange excludes its stop value, hence the extra bin_width.
bins = np.arange(lowest_edge, highest_edge + bin_width, bin_width)

# Named `range_labels` so the cluster `labels` array produced by KMeans is not
# overwritten when this cell runs.
range_labels = [f'{start}-{end}' for start, end in zip(bins[:-1], bins[1:])]

# assign() returns a new frame, which avoids SettingWithCopyWarning when `df`
# is a filtered view of the originally loaded table.
df = df.assign(
    OVERALL_RANGE=pd.cut(
        df['OVERALL_RATING'],
        bins=bins,
        labels=range_labels,
        include_lowest=True,
        right=False,
    )
)

df.to_csv('../data/players_dataset_with_fixed_wage_and_overall_ranges.csv', index=False)

df.head()

Unnamed: 0,NAME,BIRTH_DATE,COUNTRY,TEAM,ID,OVERALL_RATING,POTENTIAL,VALUE,WAGE,HEIGHT,...,DEFENSIVE_AWARENESS,STANDING_TACKLE,SLIDING_TACKLE,GK_DIVING,GK_HANDLING,GK_KICKING,GK_POSITIONING,GK_REFLEXES,POSITION,OVERALL_RANGE
0,Manor Solomon,24/07/1999,Israel,Fulham,246791,77,84,€21.5M,1000.0,170,...,50.0,47.0,35.0,12.0,11.0,9.0,13.0,10.0,LM,75-80
1,Marcus Rashford,31/10/1997,England,Manchester United,231677,85,88,€73.5M,170000.0,186,...,40.0,38.0,33.0,11.0,6.0,15.0,7.0,14.0,LW,85-90
2,Hugo Bueno López,18/09/2002,Spain,Wolverhampton Wanderers,261224,70,82,€3.6M,18000.0,180,...,64.0,66.0,64.0,9.0,12.0,13.0,6.0,11.0,LB,70-75
3,William Alain André Gabriel Saliba,24/03/2001,France,Arsenal,243715,83,89,€53M,73000.0,193,...,85.0,87.0,83.0,7.0,10.0,6.0,8.0,11.0,CB,80-85
4,Bukayo Saka,05/09/2001,England,Arsenal,246669,85,90,€80M,110000.0,178,...,65.0,69.0,67.0,7.0,8.0,13.0,9.0,13.0,RW,85-90
