# Simple EDA for [Euro-2020-top-players](https://www.kaggle.com/bakar31/euro-2020-top-players) dataset



In [None]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Euro 2020 top-players CSV from the Kaggle input directory
# and preview its first ten rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/euro-2020-top-players/euro2020.csv"
)
df.head(n=10)

### Summary of the dataset

In [None]:
# Per-column overview of df: dtype, number of missing values, number of
# distinct values, and the rows labelled 0, 1 and 2 as sample values.
summary = pd.DataFrame(df.dtypes, columns=["datatype"]).assign(
    isnull=df.isna().sum(),
    unique=df.nunique(),
    first=df.loc[0],
    second=df.loc[1],
    third=df.loc[2],
)
summary

# Description of the Columns

* <b> Player</b>: Name of the player
* <b>Country</b>: Player's country
* <b>Position</b>: Position in which the player plays
* <b>Match played</b>: Number of matches played
* <b>Goals</b>: How many goals scored
* <b>Right foot goals</b>: How many goals scored by right foot
* <b>Header goals</b>: How many goals scored by head
* <b>Assists</b>: Number of assists
* <b>Total attempts</b>: Total attempts on goal
* <b>On target</b>: Total attempts on target
* <b>Off target</b>: Total attempts off target
* <b>Woodwork</b>: Total attempts hit the post
* <b>Shot blocks</b>: Number of shots blocked by defenders
* <b>Avg gpg</b>: Average goals per game
* <b>Fouls suffered</b>: Total fouls suffered
* <b>Fouls committed</b>: Total fouls committed
* <b>Top speed</b>: Top speed
* <b>Passing accuracy</b>: Passing accuracy
* <b>Distance covered</b>: Total distance covered
* <b>Clearence attempted</b>: Total attempts to clear the ball
* <b>Balls recovered</b>: Total balls recovered
* <b>Tackles</b>: Total tackles
* <b>Blocks</b>: Total shots blocked
* <b>Yellow cards</b>: Total yellow cards
* <b>Red cards</b>: Total red cards
* <b>Minutes played</b>: Total minutes played

### Length of the data frame

In [None]:
# Number of rows in the raw data frame.
df.shape[0]
# There are only 59 rows, and the "Blocks" column is empty for most of them, so let's drop that column.
# A null value there may simply mean the player made zero blocks, but for now we just drop the column.

In [None]:
# New frame without the "Blocks" column; df itself is not modified.
train_df = df.drop(columns=["Blocks"])

### Let's also impute the null values with zero

In [None]:
# Replace every remaining null with 0, on the assumption that a missing
# value most probably means the player recorded none of that statistic.
train_df = train_df.fillna(0)

In [None]:
# Number of columns left in train_df.
train_df.shape[1]

### Univariate Analysis

In [None]:
# Draw one count plot per column of train_df on a two-column grid.
# The number of grid rows is derived from the column count instead of being
# hard-coded to 13, so the cell keeps working when train_df gains or loses
# columns (e.g. when it is re-run after the "gpa" column has been added).
plot_columns = train_df.columns.to_list()
n_grid_cols = 2
n_grid_rows = max(1, -(-len(plot_columns) // n_grid_cols))  # ceiling division
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(100, 300), squeeze=False)
# ax.flat walks the grid row by row, i.e. left-to-right, then top-to-bottom.
for col, axis in zip(plot_columns, ax.flat):
    sns.countplot(data=train_df, x=col, color="red", ax=axis)
# Hide the spare axes left over when the number of columns is odd.
for axis in ax.flat[len(plot_columns):]:
    axis.set_visible(False)

### Observations made :
1. It could be seen that all players are unique so nothing important in the very first plot.
2. Among the top players, Italy and England contributed the most players (7 each) 😎, while Sweden, Poland and Switzerland contributed just one player each. 😐
3. Players are spread across different positions, with most of them playing in the Middle Infield position.
4. Most of the players have played more than 4 matches, and there are very few who did not play any match but still kept their place among the top players.
5. It can be seen that the max goal by any player is 5 and most of the players did 0 goal.😐
6. Max number of right foot goal is 3 while  min being 0.
7. Max left foot goal is again 3 while min is 0
8. max Header goals by any player is 2
9. The graph of assists is left skewed.
10. The total attempt to goal is 2 by many players while it can be seen that there are players whose total attempt is 20 and that is amazing
11. Most of the players had one attempt on target.
12. Off target attempts have max value as 9 
13. Most players have hard luck at hitting the post while max hit is 2.
14. The max number of shot blocked by any defender is 6 while most of the defenders were not able to block.
15. The max average goal per game is 1.25 while min being 0.
16. Max fouls suffered by any player is 18 and the data is left skewed
17. Max fouls committed by any player is 12, but most of the players committed 5 fouls, as can be seen.
18. The player having the highest top speed is 33 while average top speed being around 30 with the data being left skewed.
19. The max passing accuracy is around 98% while min being 66%.
20. Total distance covered seems to be unique for almost all of the players.
21. The maximum number of clearances attempted is around 29, and the data seems to be left skewed.
22. Total balls recovered seem to be unique with few ups and downs.
23. The max number of tackles can be seen to be 30.
24. The max number of  yellow card received by any player is 3 while there are many players who did not receive any yellow card.
25. Nothing interesting can really be seen.
26. Minutes played do not have uniform distribution.
    

### Top 10 players with max goals

In [None]:
# Show the column labels currently present in train_df.
train_df.columns

In [None]:
# Rank the players by several metrics, keeping the ten highest rows for each.
top_players = train_df.sort_values(by="Goals", ascending=False).head(10)
# "gpa" = goals divided by total attempts; stored as a new column of train_df.
train_df["gpa"] = train_df["Goals"].div(train_df["Total attempts"])
attempts = train_df.sort_values(by="gpa", ascending=False).head(10)
agp = train_df.sort_values(by="Avg gpg", ascending=False).head(10)
passing_accuracy = train_df.sort_values(by="Passing accuracy", ascending=False).head(10)
header = train_df.sort_values(by="Header goals", ascending=False).head(10)

### Top players on the basis of total goals.

In [None]:
# Bar chart of goals for the players in top_players, coloured by position.
plt.figure(figsize=(20, 5))
goals_ax = sns.barplot(
    data=top_players, x="Player", y="Goals", hue="Position", palette="Greys"
)
goals_ax.set_title("Top players on the basis of max goals")
plt.show()

### Top players on the basis of average goals per game.

In [None]:
# Two stacked bar charts of average goals per game for the rows in agp:
# by player on top, by country below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 10))
player_ax, country_ax = ax
sns.barplot(data=agp, x="Player", y="Avg gpg", palette="Greys", ax=player_ax)
sns.barplot(data=agp, x="Country", y="Avg gpg", palette="Greens", ax=country_ax)
plt.show()

### Top players with max total goals per attempts.

In [None]:
# Bar chart of the "gpa" (goals per attempt) value for the rows in attempts.
plt.figure(figsize=(20, 5))
gpa_ax = sns.barplot(data=attempts, x="Player", y="gpa", palette="Greens")
gpa_ax.set_title("Top players on the basis of total goal/attempts")
plt.show()

### Great at passing.

In [None]:
# Two stacked bar charts of passing accuracy for the rows in passing_accuracy:
# by player on top, by country below.
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.barplot(x="Player", data=passing_accuracy, y="Passing accuracy", palette="Reds", ax=ax[0])
sns.barplot(x="Country", data=passing_accuracy, y="Passing accuracy", palette="Blues", ax=ax[1])
# plt.title() would only title the current (bottom) axes, so the heading
# would appear between the two charts; suptitle labels the whole figure.
fig.suptitle("Top players on the basis of passing accuracy")
plt.show()

### Header goalers.

In [None]:
# Bar chart of header goals for the rows in header, coloured by total goals.
plt.figure(figsize=(20, 5))
header_ax = sns.barplot(
    data=header, x="Player", y="Header goals", hue="Goals", palette="Reds"
)
header_ax.set_title("Top players on the basis header goals")
plt.show()

### Speedy players.

In [None]:
# Ten rows of train_df with the highest "Top speed", shown as a bar chart.
speedy = train_df.sort_values(["Top speed"], ascending=False).iloc[:10]
plt.figure(figsize=(20, 5))
sns.barplot(x="Player", data=speedy, y="Top speed", palette="Greens")
# Title fixed: the original read "Top playerson" (missing space).
plt.title("Top players on the basis of their speed")
plt.show()

### Yellow cards receivers.

In [None]:
# Ten rows of train_df with the most yellow cards, shown as a bar chart.
yelloww = train_df.sort_values(by="Yellow cards", ascending=False).head(10)
plt.figure(figsize=(20, 5))
yellow_ax = sns.barplot(data=yelloww, x="Player", y="Yellow cards", palette="Greens")
yellow_ax.set_title("Yellow card receivers")
plt.show()

### Shot Blockers

In [None]:
# Ten rows of train_df with the highest "Shot blocks", shown as a bar chart.
blockers = train_df.sort_values(by="Shot blocks", ascending=False).head(10)
plt.figure(figsize=(20, 5))
blocks_ax = sns.barplot(data=blockers, x="Player", y="Shot blocks", palette="Blues")
blocks_ax.set_title("Shot Blockers")
plt.show()

### Matches Played

In [None]:
# Ten rows of train_df with the most matches played, shown as a bar chart.
# NOTE: the result is stored under the name `blockers`, which the
# shot-blockers cell also uses; here it holds the matches-played ranking.
blockers = train_df.sort_values(by="Match played", ascending=False).head(10)
plt.figure(figsize=(20, 5))
matches_ax = sns.barplot(data=blockers, x="Player", y="Match played", palette="Oranges")
matches_ax.set_title("Matches Played")
plt.show()

### Number of assists

In [None]:
# Ten rows of train_df with the most assists, shown as a bar chart.
Assisters = train_df.sort_values(["Assists"], ascending=False).iloc[:10]
plt.figure(figsize=(20, 5))
sns.barplot(x="Player", data=Assisters, y="Assists", palette="Greens")
# Title fixed: the original was copy-pasted as "Matches Played" although
# this chart plots assists.
plt.title("Number of assists")
plt.show()

## Let's look at bad performers

### Bad at passing

In [None]:
# Ten rows of train_df with the lowest passing accuracy, shown as a bar chart.
Worst_passers = train_df.sort_values(by="Passing accuracy").head(10)
plt.figure(figsize=(20, 5))
worst_ax = sns.barplot(
    data=Worst_passers, x="Player", y="Passing accuracy", palette="Greens"
)
worst_ax.set_title("Not good Passers")
plt.show()

### Fouls suffered

In [None]:
# Ten rows of train_df with the most fouls suffered, shown as a bar chart.
Foulers = train_df.sort_values(by="Fouls suffered", ascending=False).head(10)
plt.figure(figsize=(20, 5))
suffered_ax = sns.barplot(data=Foulers, x="Player", y="Fouls suffered", palette="Greens")
suffered_ax.set_title("Foul sufferers")
plt.show()

### Foul Committers

In [None]:
# Ten rows of train_df with the most fouls committed, shown as a bar chart.
Foulers = train_df.sort_values(by="Fouls committed", ascending=False).head(10)
plt.figure(figsize=(20, 5))
committed_ax = sns.barplot(data=Foulers, x="Player", y="Fouls committed", palette="Greens")
committed_ax.set_title("Foul makers")
plt.show()

### Do let me know if you think I missed something. Your feedback is what will help me learn!📉📊📈
### Do let me know what you think of this notebook in the comments!👇
### Happy Learning! 💪💪