In [120]:
import seaborn as sea
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt

from pandas.api.types import is_numeric_dtype

Import the dataset into a DataFrame

In [121]:
# Load the FIFA 20 player table; the path is resolved relative to the working directory.
players_csv_path = "players_20.csv"
fifa_20_players = pd.read_csv(players_csv_path)

# Basic information about the dataset

In [122]:
# Print a concise summary of the frame: index range, number of columns, dtypes and memory usage.
fifa_20_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18278 entries, 0 to 18277
Columns: 104 entries, sofifa_id to rb
dtypes: float64(16), int64(45), object(43)
memory usage: 14.5+ MB


This dataset consists of:
- 18278 rows and
- 104 columns

List of column names:

In [123]:
# List every column label of the frame, one label per line.
for column_label in fifa_20_players.columns:
    print(column_label)

sofifa_id
player_url
short_name
long_name
age
dob
height_cm
weight_kg
nationality
club
overall
potential
value_eur
wage_eur
player_positions
preferred_foot
international_reputation
weak_foot
skill_moves
work_rate
body_type
real_face
release_clause_eur
player_tags
team_position
team_jersey_number
loaned_from
joined
contract_valid_until
nation_position
nation_jersey_number
pace
shooting
passing
dribbling
defending
physic
gk_diving
gk_handling
gk_kicking
gk_reflexes
gk_speed
gk_positioning
player_traits
attacking_crossing
attacking_finishing
attacking_heading_accuracy
attacking_short_passing
attacking_volleys
skill_dribbling
skill_curve
skill_fk_accuracy
skill_long_passing
skill_ball_control
movement_acceleration
movement_sprint_speed
movement_agility
movement_reactions
movement_balance
power_shot_power
power_jumping
power_stamina
power_strength
power_long_shots
mentality_aggression
mentality_interceptions
mentality_positioning
mentality_vision
mentality_penalties
mentality_composure
defe

In [124]:
# Select the rows whose sofifa_id already occurred in an earlier row;
# an empty result means every sofifa_id is unique.
fifa_20_players.loc[fifa_20_players.duplicated(subset='sofifa_id')]

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb


As we can see, there are no duplicated rows in this dataset based on the sofifa_id column. You may find two or more players with the same name and the same nationality. This is fine, because the rest of their attributes differ.

# Deleting unnecessary columns

We decided that the columns listed below are irrelevant for our further analysis. Some of them contain a lot of NaN values, while others simply do not seem useful. Even without these 5 columns, we should be able to find something interesting in the data.

In [125]:
# Remove the five columns that are not needed for the rest of the analysis.
fifa_20_players = fifa_20_players.drop(
    labels=[
        'player_url',
        'dob',
        'body_type',
        'nation_position',
        'nation_jersey_number',
    ],
    axis="columns",
)

In [126]:
# Print the frame summary again to inspect the remaining columns, their non-null counts and dtypes.
fifa_20_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18278 entries, 0 to 18277
Data columns (total 99 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   sofifa_id                   18278 non-null  int64  
 1   short_name                  18278 non-null  object 
 2   long_name                   18278 non-null  object 
 3   age                         18278 non-null  int64  
 4   height_cm                   18278 non-null  int64  
 5   weight_kg                   18278 non-null  int64  
 6   nationality                 18278 non-null  object 
 7   club                        18278 non-null  object 
 8   overall                     18278 non-null  int64  
 9   potential                   18278 non-null  int64  
 10  value_eur                   18278 non-null  int64  
 11  wage_eur                    18278 non-null  int64  
 12  player_positions            18278 non-null  object 
 13  preferred_foot              182

# Replacing NaN values for specific attributes

We do have players in the dataframe who do not have any values assigned in specific columns. In most cases, it depends on the player's position. Some attributes may be goalkeeper related, others may not.

In [127]:
# Print one "<column> <missing count>" line per column.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in pandas 2.0,
# so the counts are computed in a single vectorized pass and iterated with
# Series.items(), which is available in every pandas version.
missing_per_column = fifa_20_players.isnull().sum()
for column_name, missing_count in missing_per_column.items():
    print(f"{column_name} {missing_count}")

sofifa_id 0
short_name 0
long_name 0
age 0
height_cm 0
weight_kg 0
nationality 0
club 0
overall 0
potential 0
value_eur 0
wage_eur 0
player_positions 0
preferred_foot 0
international_reputation 0
weak_foot 0
skill_moves 0
work_rate 0
real_face 0
release_clause_eur 1298
player_tags 16779
team_position 240
team_jersey_number 240
loaned_from 17230
joined 1288
contract_valid_until 240
pace 2036
shooting 2036
passing 2036
dribbling 2036
defending 2036
physic 2036
gk_diving 16242
gk_handling 16242
gk_kicking 16242
gk_reflexes 16242
gk_speed 16242
gk_positioning 16242
player_traits 10712
attacking_crossing 0
attacking_finishing 0
attacking_heading_accuracy 0
attacking_short_passing 0
attacking_volleys 0
skill_dribbling 0
skill_curve 0
skill_fk_accuracy 0
skill_long_passing 0
skill_ball_control 0
movement_acceleration 0
movement_sprint_speed 0
movement_agility 0
movement_reactions 0
movement_balance 0
power_shot_power 0
power_jumping 0
power_stamina 0
power_strength 0
power_long_shots 0
mental

All attributes beginning with "gk" were given the value 0.0 wherever they were missing. This is because only goalkeepers have values assigned for these specific attributes, and there are not many of them in this dataset. We have done the same thing with the column "release_clause_eur". Missing values in "player_traits" and "loaned_from" were replaced with the string "None", and missing values in "player_tags" with the string "#None". Where the column "joined" contained NaN values, it received a fixed placeholder date (2019-08-31).

In [128]:
# Placeholder to use for missing values, keyed by column name.
# The goalkeeper ratings and the release clause are numeric, so they get 0.0;
# the text columns get a marker string and "joined" gets a fixed placeholder date.
nan_fill_values = {
    'gk_diving': 0.0,
    'gk_handling': 0.0,
    'gk_kicking': 0.0,
    'gk_reflexes': 0.0,
    'gk_speed': 0.0,
    'gk_positioning': 0.0,
    'release_clause_eur': 0.0,
    'player_tags': "#None",
    'player_traits': 'None',
    'loaned_from': 'None',
    'joined': '2019-08-31',
}

# One fillna call replaces the NaN values of exactly the columns named in the
# mapping; every other column is left untouched.
fifa_20_players = fifa_20_players.fillna(value=nan_fill_values)

# Re-check the number of missing values per column after the replacement.
# Series.items() is used because DataFrame.iteritems() was removed in pandas 2.0.
for column_name, missing_count in fifa_20_players.isnull().sum().items():
    print(f"{column_name} {missing_count}")

sofifa_id 0
short_name 0
long_name 0
age 0
height_cm 0
weight_kg 0
nationality 0
club 0
overall 0
potential 0
value_eur 0
wage_eur 0
player_positions 0
preferred_foot 0
international_reputation 0
weak_foot 0
skill_moves 0
work_rate 0
real_face 0
release_clause_eur 0
player_tags 0
team_position 240
team_jersey_number 240
loaned_from 0
joined 0
contract_valid_until 240
pace 2036
shooting 2036
passing 2036
dribbling 2036
defending 2036
physic 2036
gk_diving 0
gk_handling 0
gk_kicking 0
gk_reflexes 0
gk_speed 0
gk_positioning 0
player_traits 0
attacking_crossing 0
attacking_finishing 0
attacking_heading_accuracy 0
attacking_short_passing 0
attacking_volleys 0
skill_dribbling 0
skill_curve 0
skill_fk_accuracy 0
skill_long_passing 0
skill_ball_control 0
movement_acceleration 0
movement_sprint_speed 0
movement_agility 0
movement_reactions 0
movement_balance 0
power_shot_power 0
power_jumping 0
power_stamina 0
power_strength 0
power_long_shots 0
mentality_aggression 0
mentality_interceptions 0

After replacing NaN values, we can still see some attributes with missing values. We have decided to leave them as they are for now. The ones missing 2036 values belong to goalkeepers, who do not have these attributes defined in the game. We will have to work around this later if we decide to use one of these attributes.

# Characteristics of Attributes

Here we have picked the most important attributes that we might use later:

sofifa_id - unique id for each player

short_name - player's name

age - number that defines player's age

height_cm - number that defines player's height in centimetres

weight_kg - number that defines player's weight in kilograms

nationality - country of origin

club - club a player plays for

overall - whole number between 0 and 99 which represents how good the player is at his position; the higher the number, the better the player. The same interpretation applies to all attributes that are defined in this range

potential - whole number between 0 and 99 which is greater or equal to overall

value_eur - player's value in euros

wage_eur - player's wage in euros

player_positions - list of positions a player is capable of playing at

preferred_foot - either left or right

real_face - either yes or no

team_position - position a player plays in within his club

pace, shooting, passing, dribbling, defending, physic - whole numbers between 0 and 99, goalkeepers do not have values assigned for these attributes

gk_diving, gk_handling, gk_kicking, gk_reflexes, gk_speed, gk_positioning - whole numbers between 0 and 99, only goalkeepers have values assigned for these attributes

Each attribute after "gk_positioning" until "ls" (except the text column "player_traits") has a whole number between 0 and 99 assigned. This number tells you how good the player is at that particular activity.