In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We will use the FIFA 20 Dataset out of the above given datasets.

# Reading the Data

In [None]:
# Load the FIFA 20 player table from the Kaggle input directory and preview its first rows.
players_csv = '/kaggle/input/fifa-20-complete-player-dataset/players_20.csv'
dataset = pd.read_csv(players_csv)
dataset.head()

# Analyzing and Cleaning the Dataset

The next thing we will do is finding the total rows and columns. 

In [None]:
# Dimensions of the loaded table; as the cell's last expression the tuple is displayed.
dataset.shape # in the form of (rows,cols)

Now we will list all the columns. Doing so will help us decide which columns to use and which ones to drop. This will help us clean the dataset and make it easier to use by keeping only the relevant data.

In [None]:
# Print every column name as a plain Python list so the full set is visible at once.
print(dataset.columns.tolist())

Now we will try to drop as many columns as possible which won't be helpful for analysis.

In [None]:
# Columns judged unhelpful for this analysis.
drop_columns = [
    "sofifa_id",
    "player_url",
    "dob",
    "long_name",
    "nation_position",
    "loaned_from",
    "nation_jersey_number",
]
# errors="ignore" keeps the cell re-runnable: without it, executing the cell a second
# time raises KeyError because the columns have already been removed from `dataset`.
dataset = dataset.drop(columns=drop_columns, errors="ignore")

**Column names sometimes contain spaces, which makes them awkward to reference, so we will remove any spaces from the column names.**

In [None]:
# Remove every space character from the column names (literal replacement, no regex).
dataset.columns = dataset.columns.str.replace(' ', '', regex=False)

Now, I will check if the dataset has any null values or not.

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

Since a few columns such as **lb, lcb, etc.** have null values, I will fill them with 0 so that the dataset has no null values.

In [None]:
# Replace every missing value with 0, then re-count the nulls per column.
dataset = dataset.fillna(value=0)
dataset.isna().sum()

I will now be using the columns listed below for performing the EDA.

In [None]:
# Columns kept for the exploratory analysis.
eda_columns = [
    'short_name', 'age', 'height_cm', 'weight_kg', 'nationality', 'club',
    'overall', 'potential', 'value_eur', 'wage_eur', 'player_positions',
    'preferred_foot', 'international_reputation', 'weak_foot',
    'skill_moves', 'work_rate', 'body_type', 'real_face',
    'release_clause_eur',
]
# .copy() makes cleaned_dataset an independent frame rather than a slice of `dataset`,
# so adding a column to it later (the BMI column) does not raise SettingWithCopyWarning.
cleaned_dataset = dataset[eda_columns].copy()

Finally now that the data is cleaned, I will be using **Plotly** to perform data visualization.

# Visualizing the Data

In [None]:
import plotly
import plotly.graph_objects as go
import plotly.express as px

First of all, let's find the age distribution of all the players and also the top 10 nationalities of players.

In [None]:
# Histogram of player ages, at most 20 bins, with a thin gap between bars.
player_ages = cleaned_dataset['age']
fig = px.histogram(
    player_ages,
    x="age",
    nbins=20,
    barmode="group",
    color_discrete_sequence=['green'],
    title="𝐃𝐢𝐬𝐭𝐫𝐢𝐛𝐮𝐭𝐢𝐨𝐧 𝐨𝐟 𝐀𝐠𝐞 𝐨𝐟 𝐏𝐥𝐚𝐲𝐞𝐫𝐬",
)
fig.update_layout(bargap=0.01)
fig.show()

In [None]:
# Number of players per nationality, most common first.
# The output columns are named explicitly ("index" = nation, "nationality" = player count)
# because a bare value_counts().reset_index() produces different column names on
# pandas < 2.0 and pandas >= 2.0, and the bar chart that follows reads these two names.
counts_Nationality = (
    cleaned_dataset["nationality"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="nationality")
)
# Ten most represented nations; the bare last expression gives the rich table display.
top10 = counts_Nationality.head(10)
top10

In [None]:
# Bar chart of the ten most represented nations; the labels map rename the two
# columns of `top10` to readable axis titles.
nation_axis_labels = {"index": "Nations", "nationality": "No. of players"}
fig_nations = px.bar(
    top10,
    x="index",
    y="nationality",
    labels=nation_axis_labels,
    title="Top 10 nations",
)
fig_nations.show()

Thus, we see that the largest number of players comes from **England**, with **1667** players.

### Percentages of Nations of top 50 FIFA players

In [None]:
# Share of each nationality among the first 50 rows of the table.
first_fifty = cleaned_dataset.head(50)
fig = px.pie(
    first_fifty,
    names='nationality',
    title='Percentage of Nations of the Top 50 FIFA Players',
)
fig.show()

Thus, among the top 50 players, the highest number are from **France**, followed by **Brazil**.

### Percentage of Clubs among the top 50 FIFA players

In [None]:
# Share of each club among the first 50 rows of the table.
first_fifty = cleaned_dataset.head(50)
fig = px.pie(
    first_fifty,
    names='club',
    title='Percentage of Clubs among the Top 50 FIFA Players',
)
fig.show()

Thus, **Manchester City** has the highest representation among the top 50 FIFA players.

## Preferred Foot

Now, we will check the preferred foot while playing.

In [None]:
from plotly.subplots import make_subplots

# Count of players per preferred foot, drawn as a histogram on a one-cell subplot grid.
foot_counts = go.Histogram(x=cleaned_dataset["preferred_foot"], histfunc="count")
fig = make_subplots(rows=1, cols=1)
fig.add_trace(foot_counts, row=1, col=1)
fig.show()
# The same split as a pie chart; as the cell's last expression it is rendered automatically.
px.pie(cleaned_dataset, names='preferred_foot')

Thus, most of the players prefer their **right** foot for playing.

## Top rated Players by ranking

Graph showing potential of Players.

In [None]:
# Potential of the first 10 players in the table, with bars coloured by age.
# `labels` must be a dict mapping column name -> display label; the original passed a
# set containing an unrelated title string, which labelled nothing.
potential_labels = {
    'short_name': 'Player Name',
    'potential': 'Potential',
    'age': 'Age',
}
fig = px.bar(
    cleaned_dataset.head(10),
    x='short_name',
    y='potential',
    color='age',
    labels=potential_labels,
    height=400,
)
fig.update_layout(
    title='Comparison of Potential of Top 10 FIFA Players',
    xaxis_title='Player Name',
    yaxis_title='Potential',
)
fig.show()

### International Reputation

In [None]:
# Number of players at each international-reputation level, most common level first.
# The output columns are named explicitly ("index" = level, "international_reputation" =
# player count) because a bare value_counts().reset_index() produces different column
# names on pandas < 2.0 and pandas >= 2.0, and the bar chart that follows reads these names.
repocount = (
    cleaned_dataset["international_reputation"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="international_reputation")
)
# Every level is kept; the original `iloc[0:]` slice selected all rows as well.
reputation_rankings = repocount
reputation_rankings

Thus, we see there are **5** reputation rankings.

Let's plot this now.

In [None]:
# Bar chart of player counts per reputation level; the labels map renames the two
# columns of `reputation_rankings` to readable axis titles.
reputation_axis_labels = {"index": "Level", "international_reputation": "Numbers"}
fig_repo = px.bar(
    reputation_rankings,
    x="index",
    y="international_reputation",
    color_discrete_sequence=['chocolate'],
    labels=reputation_axis_labels,
)
fig_repo.show()

The bars for International Reputation levels 4 and 5 are barely visible in the graph because very few players have these levels. We will therefore look at those players directly in the section below.

In [None]:
# First rows of the players whose international reputation is 4.
# The mask is built from cleaned_dataset itself rather than from `dataset`, so the
# filter does not depend on the two frames sharing the same index.
cleaned_dataset.loc[cleaned_dataset['international_reputation'] == 4].head()

In [None]:
# First rows of the players whose international reputation is 5.
# The mask is built from cleaned_dataset itself rather than from `dataset`, so the
# filter does not depend on the two frames sharing the same index.
cleaned_dataset.loc[cleaned_dataset['international_reputation'] == 5].head()

### Wages and Value of Players

Let us find the top 5 countries by average player wage.

In [None]:
# Mean wage and mean overall rating per nationality, highest mean wage first.
# The columns are selected with a list: selecting them with a bare tuple
# (["wage_eur", "overall"]) is rejected by pandas >= 2.0.
# The sort is chained instead of done in place, so re-running the cell is safe.
overall_wage = (
    cleaned_dataset
    .groupby("nationality", as_index=False)[["wage_eur", "overall"]]
    .mean()
    .sort_values(by="wage_eur", ascending=False)
)
# NOTE: despite its name, topwage10 holds the top 5 rows; the name is kept because the
# plotting cell that follows refers to it.
topwage10 = overall_wage.head(5)
topwage10

In [None]:
# Horizontal bar chart of the rows in `topwage10`: one bar per nationality, bar length = wage_eur.
wage_axis_labels = {"nationality": "Nationality", "wage_eur": "Wage"}
fig_wage = px.bar(
    topwage10,
    x="wage_eur",
    y="nationality",
    orientation="h",
    labels=wage_axis_labels,
    color_discrete_sequence=['turquoise'],
)
fig_wage.show()

Thus, **Dominican Republic** has the highest average wage (the mean of `wage_eur` across that nation's players in the dataset).

We will now compare players on the basis of their values and potential.

Let's list out the top 10 players by potential.

In [None]:
# Ten rows with the highest potential, restricted to the columns of interest.
potential_view = ['short_name', 'club', 'potential', 'overall', 'value_eur']
cleaned_dataset[potential_view].sort_values(by="potential", ascending=False).head(10)

In [None]:
# Overall rating against value in euros; marker colour encodes age (with a colour scale)
# and the hover text is the player's short name.
value_trace = go.Scatter(
    x=cleaned_dataset['overall'],
    y=cleaned_dataset['value_eur'],
    mode='markers',
    text=cleaned_dataset['short_name'],
    marker={'size': 10, 'color': cleaned_dataset['age'], 'showscale': True},
)
fig = go.Figure(data=value_trace)
fig.update_layout(
    title='Scatter Plot for Overall Rating v Value in Euros',
    xaxis_title='Overall Rating',
    yaxis_title='Value in Euros',
)
fig.show()

Thus, **Neymar Jr.** is the player with the highest value of **105.5M**.

In [None]:
# Potential rating against wage in euros; marker colour encodes age (with a colour scale)
# and the hover text is the player's short name.
wage_trace = go.Scatter(
    x=cleaned_dataset['potential'],
    y=cleaned_dataset['wage_eur'],
    mode='markers',
    text=cleaned_dataset['short_name'],
    marker={'size': 10, 'color': cleaned_dataset['age'], 'showscale': True},
)
fig = go.Figure(data=wage_trace)
fig.update_layout(
    title='Scatter Plot for Potential Rating v Wage in Euros',
    xaxis_title='Potential Rating',
    yaxis_title='Wage in Euros',
)
fig.show()

Thus, **L. Messi** is the highest-paid footballer in the FIFA 20 dataset.

## Weight and BMI

We will now look at the distribution of players' weights and plot their BMI against their overall rating.

In [None]:
# Add a BMI column: weight in kg divided by the square of height in metres
# (height_cm / 100). assign() returns a new frame instead of writing into what may be a
# slice of `dataset`, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
cleaned_dataset = cleaned_dataset.assign(
    BMI=cleaned_dataset['weight_kg'] / (cleaned_dataset['height_cm'] / 100) ** 2
)

Now we will plot the weight distribution of Players

In [None]:
# Histogram of player weights, at most 10 bins, with a thin gap between bars.
player_weights = cleaned_dataset['weight_kg']
weight_labels = {
    "weight_kg": "Weight Distribution",
    "count": "No. of players",
}
fig = px.histogram(
    player_weights,
    x="weight_kg",
    nbins=10,
    barmode="group",
    labels=weight_labels,
    color_discrete_sequence=['red'],
    title="Weight Of Players",
)
fig.update_layout(bargap=0.01)
fig.show()

Thus, most of the players have weight in the range of **70-80** kg.

In [None]:
# Overall rating against BMI; marker colour encodes age (with a colour scale)
# and the hover text is the player's short name.
bmi_trace = go.Scatter(
    x=cleaned_dataset['overall'],
    y=cleaned_dataset['BMI'],
    mode='markers',
    text=cleaned_dataset['short_name'],
    marker={'size': 10, 'color': cleaned_dataset['age'], 'showscale': True},
)
fig = go.Figure(data=bmi_trace)
fig.update_layout(
    title='Scatter Plot for Overall Rating vs BMI of a Player',
    xaxis_title='Overall Rating',
    yaxis_title='BMI of a Player',
)
fig.show()

Thus, the player with the highest BMI is **Adebayo Akinfenwa** with a BMI of 34.17

# 3D Plots

Here is a 3D plot comparing Potential, Overall and Value in Euros of Top 10 FIFA Players.

In [None]:
# 3D scatter of potential, overall rating and value for the first 10 rows,
# one colour per player.
first_ten = cleaned_dataset.head(10)
fig = px.scatter_3d(
    first_ten,
    x='potential',
    y='overall',
    z='value_eur',
    color='short_name',
)
fig.update_layout(
    title='3D Plot of Potential, Overall and Value in Euros of Top 10 FIFA Players'
)
fig.show()

# Conclusion

Thus, I learned about Exploratory Data Analysis using the FIFA 20 player dataset.
This is the end of the notebook. Hope you enjoyed it!