# Problem Statement
- This is FIFA-20 dataset with 18,000+ rows & 100+ columns.
- Using clustering algorithms to categorize similar players using their traits and their different skills in the field.

- Importing the libraries

In [None]:
#import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import Counter
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
from matplotlib import rcParams

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Loading the dataset
data=pd.read_csv("../input/fifa-2020-player-dataset/players_20.csv",parse_dates=True)

# EDA

In [None]:
data.head(2)

In [None]:
data.shape

In [None]:
data.columns

- As this is a football players dataset, we have a mix of outfield players and goalkeepers. We cannot analyse a goalkeeper in the same way as an outfield player, so let us drop some columns and separate the goalkeepers from the other players for separate analysis.

In [None]:
# Columns that are not useful for the analysis: identifiers/URLs, jersey numbers,
# loan and tag text, the multi-valued player_positions column and the
# goalkeeping_* attribute columns. Each label is listed exactly once
# ('team_jersey_number' used to appear twice).
columns_to_drop = [
    'player_url', 'sofifa_id', 'long_name', 'team_jersey_number', 'loaned_from',
    'nation_jersey_number', 'player_tags', 'nation_position', 'player_positions',
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes',
]
# Reassign instead of mutating in place so the operation is explicit.
data = data.drop(columns=columns_to_drop)

- We can drop the player_positions column as it has too many tags attached to each player. Let's keep the team positions column as it has only one tag attached to each player.

In [None]:
data['age'].value_counts().head()

In [None]:
# Age distribution of every player in `data`; goalkeepers have not been
# separated out yet, so the title must not say "goal-keepers".
plt.figure(figsize=(15,5))
plt.title('Age Distribution of players in fifa 20', fontsize=20)
sns.distplot(a=data['age'], kde=True, bins=20)

In [None]:
plt.figure(figsize= (15,5))

ax = sns.countplot(x='age', data=data, palette='bright')
ax.set_title(label='Count of Players on Basis of Age in FIFA 20', fontsize=20)

ax.set_xlabel(xlabel='Age', fontsize=10)
ax.set_ylabel(ylabel='Count', fontsize=10)

plt.show()

- Most players are between 20 and 27 years old, with a mean age of around 25 years.
- The minimum age of the players in the game is 16 years, whereas the maximum age is 42 years.

In [None]:
data['nationality'].value_counts().head()

In [None]:
plt.figure(figsize= (15, 7))

ax = sns.countplot(x='nationality', data=data, palette=None, order=data.nationality.value_counts().iloc[:10].index)
ax.set_title(label='Count of Players on Basis of Nationality in FIFA 20', fontsize=20)

ax.set_xlabel(xlabel='Nationality', fontsize=16)
ax.set_ylabel(ylabel='Count', fontsize=16)
plt.xticks(rotation=30, fontsize=12)

plt.show()

In [None]:
data['height_cm'].value_counts()

In [None]:
# Height distribution of every player in `data`; goalkeepers have not been
# separated out yet, so the title must not say "goal-keepers".
plt.figure(figsize=(15,5))
plt.title('Height Distribution of players in fifa 20', fontsize=20)
sns.distplot(a=data['height_cm'], kde=True, bins=20)

In [None]:
df = data.filter(['club','overall'])

In [None]:
# Count players rated 80+ per club and plot the clubs with the most of them.
# The title is built from the same constant as head() so the two cannot drift
# apart (the title used to say "Top 20" while 25 clubs were plotted).
TOP_N_CLUBS = 25
MIN_OVERALL = 80

# Use a new name instead of overwriting `df`, so re-running this cell still works.
top_clubs = df[df.overall >= MIN_OVERALL]['club'].value_counts().head(TOP_N_CLUBS)

rcParams['figure.figsize'] = 10, 6
with plt.style.context('ggplot'):
    top_clubs.plot(kind='bar', color='green')
plt.title(f'Players rated {MIN_OVERALL}+ from the top {TOP_N_CLUBS} clubs')
plt.xlabel('Club')
plt.ylabel('Number of players')

In [None]:
data['preferred_foot'].value_counts()

In [None]:
ax = sns.countplot(x='preferred_foot', data=data, palette=None)
ax.set_title(label='Count of Players on Basis of preferred_foot in FIFA 20', fontsize=20)

# Analyze the data for goal-keepers

In [None]:
data['team_position'].isnull().sum()

In [None]:
# Impute missing team positions with the most frequent one. Assign the result
# back to the column: an in-place fillna on a column selection acts on a
# temporary object under pandas Copy-on-Write and would leave `data` untouched.
data['team_position'] = data['team_position'].fillna(data['team_position'].mode()[0])

In [None]:
data['team_position'].isnull().sum()

In [None]:
# For goalkeepers
data_gk=data.loc[data['team_position'] == "GK"]
data_gk.head(2)

In [None]:
# All rows whose team_position is not "GK" (the outfield players).
# NOTE(review): the same assignment is repeated in the "Analysis for players"
# section below; one of the two is redundant.
data_oth=data.loc[data['team_position']!="GK"]
data_oth.head(2)

In [None]:
data_gk_null=data_gk.isnull().sum()
data_gk_null

- Primary traits of a goal keeper are speed, positioning, reflexes, diving and handling. Goal keepers are not much associated with pace, shooting, passing,dribbling or defending. So let us drop some of these columns which don't hold much significance in goalkeeping and have null values,i.e approx 3.6%

In [None]:
# Outfield attributes, per-position rating columns, dob and player_traits are
# removed from the goalkeeper frame.
gk_irrelevant_columns = [
    'dob', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic', 'player_traits',
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm',
    'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
]
data_gk = data_gk.drop(columns=gk_irrelevant_columns)
data_gk.head(2)

In [None]:
data_gk.shape

In [None]:
data_gk['overall'].value_counts().head()

In [None]:
sns.distplot(a=data_gk['overall'], kde=True, bins=20)

- The goal keepers have overall value in the range 50-90, with a mean of 69.

In [None]:
data_gk['wage_eur'].value_counts().head()

In [None]:
sns.distplot(a=data_gk['wage_eur'], kde=True)

- The wage seems relatively less about 50K euros for the goalkeepers. The players may have higher wages. 

In [None]:
# Mean goalkeeper skill ratings as a function of potential, one line per skill.
rcParams['figure.figsize'] = 12, 10

# Each line carries its own label, so the legend cannot get out of step with
# the plotting order (the hand-written legend list used to swap gk_speed and
# gk_positioning). x/y are passed by keyword because newer seaborn releases
# no longer accept them positionally.
gk_skills = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning']
for skill in gk_skills:
    ax = sns.lineplot(x='potential', y=skill, data=data_gk, ci=None, label=skill)

ax.set_title('players potential with respect to gk skills')
ax.set_ylabel('gk_diving, gk_handling, gk_kicking , gk_positioning, gk_speed and gk_reflexes')
ax.legend()
plt.show()

- The handling and reflexes are higher for all the goal keepers, compared to other traits.

# Analysis for players

In [None]:
# For other players
data_oth=data.loc[data['team_position'] != "GK"]
data_oth.head(2)

- Drop the insignificant columns which have NaN values

In [None]:
data_oth.isnull().sum()

In [None]:
data_oth=data_oth.drop(['dob','work_rate','real_face','gk_diving','gk_handling','gk_kicking','gk_reflexes','gk_speed',\
                     'gk_positioning','player_traits'],axis=1)
data_oth.head(5)

In [None]:
# Work on an independent copy so the later column edits on data1 do not also
# modify data_oth (a plain assignment would only create a second name for the
# same DataFrame).
data1 = data_oth.copy()
data1.head()

In [None]:
# Replace missing values with 0 and assign back to the column: an in-place
# fillna on a column selection is a no-op under pandas Copy-on-Write, which
# would make the later astype(int) on contract_valid_until fail on NaN.
data1['joined'] = data1['joined'].fillna(0)
data1['contract_valid_until'] = data1['contract_valid_until'].fillna(0)

In [None]:
data1['joined']= pd.to_datetime(data1['joined'], format='%Y-%m-%d',errors='coerce')

In [None]:
data1['contract_valid_until'] =data1['contract_valid_until'].astype(int)
data1['contract_valid_until'].head()

# Bivariate analysis

In [None]:
# Age vs Overall
# x/y are passed by keyword (newer seaborn rejects positional x/y); the
# palette argument was dropped because it has no effect without `hue`.
avp = sns.lineplot(x='age', y='overall', data=data1)
plt.title('Age vs Overall', fontsize = 20)

plt.show()

- In the graph of Overall of the players, we observed that the players are improving with the step of the years. The best players are 30 years old and after that they start to decline. It is seen that in the 40 there is a peak in the graph again, this may be due to the veterans included in the games.

In [None]:
# Age vs potential
# x/y are passed by keyword (newer seaborn rejects positional x/y); the
# palette argument was dropped because it has no effect without `hue`.
avp = sns.lineplot(x='age', y='potential', data=data1)
plt.title('Age vs Potential', fontsize = 20)
plt.show()

- The potential of players aged 20-30 stays almost the same, with slight variation. Some players above 35 show higher potential, which then gradually declines with increasing age.

In [None]:
data1['wage_eur'].head()

In [None]:
data1['value_eur'].head()

In [None]:
sns.scatterplot(x= 'overall' ,y='value_eur', data=data1, color = 'orange')

In [None]:
data1['release_clause_eur'].head()

In [None]:
data1['release_clause_eur'].isnull().sum()

In [None]:
data1['release_clause_eur']=data1['release_clause_eur'].fillna((data1['release_clause_eur'].mean()))

In [None]:
data1['release_clause_eur'] =data1['release_clause_eur'].astype(int)

In [None]:
data_val=data1.loc[:,['value_eur','release_clause_eur']]
data_val.head(10)

In [None]:
data_val.head(10).plot(kind = 'bar', figsize=(20,6))
plt.title('Comparision of release_clause vs value in euro for each player')
plt.xlabel('players')
plt.ylabel('value in euros billions')

- For the ten players plotted above, the release clause is higher than the player's value.

In [None]:
data1['international_reputation'].value_counts()

In [None]:
x=data1['wage_eur']
y=data1['international_reputation']
plt.scatter(x, y)
plt.show()

- The plot gives a general idea that players with higher reputation have higher wages,and these two columns have a positive correlation

In [None]:
sns.scatterplot(x= 'age' ,y='wage_eur', data=data1,hue=None)

- The wage is distributed in the mid range with players in the age 20-34 having major amount of wages, it eventually declined with higher age players. Also young players who are considered as beginners also don't have major wages. 

In [None]:
# Mean-impute the six headline skill ratings, one column at a time.
for skill in ('pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic'):
    skill_mean = data1[skill].mean()
    data1[skill] = data1[skill].fillna(skill_mean)

In [None]:
df = data1.loc[:,['short_name','defending','physic','pace','shooting','dribbling','passing']]
df.head()

In [None]:
# Mean outfield skill ratings as a function of potential, one line per skill.
rcParams['figure.figsize'] = 12, 10

# x/y are passed by keyword because newer seaborn releases reject positional
# x/y; each line is labelled directly so the legend always matches the lines.
outfield_skills = ['defending', 'pace', 'shooting', 'passing', 'dribbling', 'physic']
for skill in outfield_skills:
    ax = sns.lineplot(x='potential', y=skill, data=data1, ci=None, label=skill)

ax.set_title('players potential with respect to skills')
ax.set_ylabel('defending, pace,shooting,passing,dribbling and physic')
ax.legend()
plt.show()

- We can analyze players have steady increase in traits like dribbling,pace & shooting above the potential of 60, and less defending skills. 

- Also we can observe that the players are in different positions in team_position, so let us create another position that can be one for the following five:
- Reserve
- Substitute
- Defender
- Midfielder
- Forward

In [None]:
# Lookup table from a team_position code to its coarse field-position label.
_FIELD_POSITION_BY_CODE = {
    **dict.fromkeys(("RB", "CB", "LB", "LWB", "RCB", "RWB", "LCB"), 'Defender'),
    **dict.fromkeys(("RW", "CF", "LW", "ST", "RS", "LS", "LF", "RF"), 'Forward'),
    **dict.fromkeys(("RM", "CM", "LM", "CAM", "LDM", "RDM", "LAM", "RAM",
                     "CDM", "RCM", "LCM"), 'MidFielder'),
    "SUB": 'Substitute',
    "RES": 'Reserve',
}


def label_pos (row):
    """Return the coarse field position for a player row.

    `row` must expose a `team_position` attribute. Codes for defenders,
    forwards and midfielders map to 'Defender', 'Forward' and 'MidFielder';
    "SUB" maps to 'Substitute' and "RES" to 'Reserve'. Any other value
    yields "other".
    """
    return _FIELD_POSITION_BY_CODE.get(row.team_position, "other")

# Derive the coarse field position for every row; label_pos is passed directly
# rather than through a wrapping lambda.
data1['field_position'] = data1.apply(label_pos, axis=1)

- Let us analyze the main characteristics like pace,shooting,passing,dribbling,defending,physic & overall according to each position.

In [None]:
# Average headline ratings per field position.
# Select the numeric rating columns BEFORE aggregating: data1 still contains
# string columns at this point, and mean() over those raises on pandas 2.x.
rating_columns = ['pace','shooting','passing','dribbling','defending','physic','overall']
ratingsByPositions = data1.groupby('field_position')[rating_columns].mean()
ratingsByPositions.plot(kind='barh', figsize=(10,15))

plt.title('Ratings Average By Position',fontsize=20)
plt.xlabel('Rating',fontsize=16)
plt.ylabel('Position',fontsize=16);

- We can see that the traits change for each player wrt to the position in which they are playing. The mid-fielder and forward have higher shooting and dribbling whereas the defender has more defending trait compared to other players.

In [None]:
# Share of players in each field position.
by_pos = data1.groupby('field_position')['age'].count()

by_pos.plot(kind='pie',figsize=(7,7), colors=['green','red','blue','orange','pink','grey'], labels=None, autopct='%1.1f%%', fontsize=16)

# The wedges follow by_pos's (sorted) index, so the legend must use that same
# index. unique() returns positions in order of first appearance, which
# attached the wrong name to the wedges.
plt.legend(labels=list(by_pos.index))
plt.title('Positions', fontsize=16)
plt.ylabel('')

In [None]:
%matplotlib inline
y=data1[data1["overall"]>50]
plt.scatter(y["overall"],y["wage_eur"],color="green")

- The wages are significantly higher for players with overall above 90 

In [None]:
# skill_moves and weak_foot as a function of potential.
rcParams['figure.figsize'] = 10, 6

# x/y are passed by keyword because newer seaborn releases reject positional
# x/y; each line is labelled directly so the legend always matches the lines.
for attribute in ['skill_moves', 'weak_foot']:
    ax = sns.lineplot(x='potential', y=attribute, data=data1, label=attribute)

ax.set_title('player skill_moves and weak_foot')
ax.set_ylabel('skill moves/weak foot')
ax.legend()
plt.show()

- Players with more skill_moves are stable as compared to players who have weak_foot

In [None]:
# skill_moves and weak_foot for the first rows of data1.
# The title is built from the same constant as head() so the two always agree
# (the title used to say "Top 20" while 15 rows were plotted).
N_PLAYERS_SHOWN = 15
data1_sw = data1.loc[:, ['skill_moves', 'weak_foot']].head(N_PLAYERS_SHOWN)
data1_sw.plot(kind = 'barh', figsize=(10,6) )
plt.title(f'Top {N_PLAYERS_SHOWN} player skill_moves and weak_foot')
plt.xlabel('skill move and weak foot')

- The above plot shows whether players are able to balance their skill moves with their weak foot.

In [None]:
x=data1['wage_eur']
y=data1['preferred_foot']
plt.scatter(x, y)
plt.show()

- We can assess from the above plot that although more players are right-footed, the left-footed players have higher wages in euros. That may be due to their playing potential; further analysis is needed to determine this factor.

In [None]:
sns.catplot(y="body_type",
            x="wage_eur",data=data1,
           height=6,kind="bar",aspect=2)
plt.title("Bodytype vs wage in eur")

- The wages are significantly higher for players like Messi, Neymar and Ronaldo irrespective of their body type.

In [None]:
columns = ['ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm', 'cdm','rdm','rwb','lb','lcb','cb','rcb','rb']

In [None]:
# Position ratings are strings of the form "<base>+<bonus>"; keep only the
# base part and store it as an integer.
for col in columns:
    # astype(str) keeps the cell re-runnable after the column is already numeric.
    base_rating = data1[col].astype(str).str.split('+', n=1, expand=True)[0]
    # Missing or unparsable ratings become 0, so the column has one consistent
    # integer dtype instead of mixing strings with the int 0 fill value.
    data1[col] = pd.to_numeric(base_rating, errors='coerce').fillna(0).astype(int)
data1.head()

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
enc=LabelEncoder()

# Integer-encode the categorical columns. DataFrame.apply refits `enc` on each
# column in turn. The result is assigned with data1[cols] = ... rather than
# data1.loc[:, cols] = ...: the .loc form writes into the existing object
# columns and keeps their object dtype, while this form replaces the columns
# so they take the integer dtype of the encoded values.
categorical_cols = ['nationality','club','body_type','team_position','field_position','preferred_foot']
data1[categorical_cols] = data1[categorical_cols].apply(enc.fit_transform)
data1.head(2)

In [None]:
data1['overall'].min()

# Clustering analysis

- Let us do K-means clustering to cluster the players based on their traits and different skills 

In [None]:
data1.columns

In [None]:
data1=data1[['short_name','age','overall','value_eur','international_reputation','skill_moves',\
             'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',\
             'field_position']]

In [None]:
data1 = data1[data1.overall > 50]

In [None]:
names = data1.short_name.tolist()

In [None]:
data1 = data1.drop(['short_name'], axis = 1) # drop the short_name column
data1.head()

In [None]:
sns.pairplot(data1,hue ='international_reputation' )

In [None]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

In [None]:
# Build the feature matrix from data1 and a min-max scaled version of it.
# NOTE(review): in the cells visible below, K-Means is fitted on `x` and on
# `data1` directly; `x_scaled` / `X_norm` are not used by any of them.
# Confirm whether the clustering was meant to run on the scaled features.
from sklearn import preprocessing
x = data1.values # numpy array
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)
X_norm = pd.DataFrame(x_scaled)

In [None]:
#K means to determine K
# Distortion = mean Euclidean distance from each sample to its nearest centroid.
# random_state pins the centroid initialisation so the curve is reproducible;
# n_init is set explicitly because its default differs between scikit-learn
# releases.
# NOTE(review): the model is fitted on the unscaled matrix `x`, so features
# with large magnitudes (value_eur) likely dominate the distances — confirm
# whether x_scaled was intended here.
distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(x)
    nearest_centroid_dist = np.min(cdist(x, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(sum(nearest_centroid_dist) / x.shape[0])

In [None]:
distortions

In [None]:
#Plot
# Elbow curve: distortion against the number of clusters.
fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
# Ticks come from K itself so the axis covers exactly the evaluated values
# (the ticks used to run 1..24 while only k = 1..9 was computed).
ax.set_xticks(list(K))
ax.set_xlabel('K-Number of clusters')
ax.set_ylabel('Distortion')
ax.set_title('The elbow method showing optimal K')
ax.plot(K,distortions,marker='x')
plt.show()

In [None]:
# Final model: three clusters over the selected player features.
# random_state makes the cluster ids (and so the colours in the scatter plot
# below) reproducible between runs; n_init is set explicitly because its
# default differs between scikit-learn releases.
# NOTE(review): the features are unscaled, so value_eur likely dominates the
# distance computation — confirm whether this is intended.
km = KMeans(n_clusters=3, n_init=10, random_state=42)
cluster_features = ['age','overall','value_eur','field_position','international_reputation',
                    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
                    'skill_moves']
y_predicted = km.fit_predict(data1[cluster_features])
y_predicted

In [None]:
km.cluster_centers_

In [None]:
X = data1.values

In [None]:
# Value vs overall rating, coloured by cluster id.
# np.float was removed from NumPy (AttributeError on 1.24+); the builtin float
# is the equivalent dtype. Columns are selected by name instead of by position
# so the axes cannot silently change if the column order of data1 does.
plt.scatter(data1['value_eur'], data1['overall'], c=y_predicted.astype(float))
plt.xlabel('value_eur')
plt.ylabel('overall')

plt.show()

- The algorithm has grouped the players into 3 clusters. The yellow cluster contains the players with the highest value in euros and overall higher ratings. The teal cluster has overall ratings from 70-90 with mid-range values in euros. The purple cluster has overall ratings from 50-80 and lower values in euros.