In [None]:
import numpy as np
import pandas as pd
import altair as alt
# alt.data_transformers.disable_max_rows()
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from altair import pipe, limit_rows, to_values
def t(data):
    """Altair data transformer: pipe `data` through limit_rows(max_rows=20000), then to_values."""
    return pipe(data, limit_rows(max_rows=20000), to_values)


# Register the transformer under the name 'custom' and make it the active one.
alt.data_transformers.register('custom', t)
alt.data_transformers.enable('custom')

In [None]:
# Load the player table; the first CSV column is used as the row index.
data1 = pd.read_csv(
    'src/data_fifa.csv',
    index_col=0,
)
# Preview the first rows.
data1.head()

In [None]:
# Target: the wage column. Features: every remaining column.
y = data1['Wage']
X = data1.drop(columns='Wage')

Split the data set into train and test sets so that results can be tested properly on held-out data.

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Put the training features and their target back side by side for exploration.
df = pd.concat([X_train, y_train], axis='columns')

Drop unrelated columns. These columns have no effect, or only a negligible effect, on the target column.

In [None]:
# Columns the analysis treats as unrelated to the wage.
unrelated_columns = [
    'ID',
    'Photo',
    'Flag',
    'Club Logo',
    'Loaned From',
    'Joined',
    'Contract Valid Until',
]
df = df.drop(unrelated_columns, axis=1)
df.head()

In [None]:
# Summarise the columns (dtype and non-null count per column) to see where values are missing.
df.info()

There are 14565 rows in the data and around 3K NA values, so we can drop the rows containing NaN values to get better visualizations.

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

Columns `Wage`, `Value` and `Release Clause` hold currency strings that are hard to visualize. I will convert those columns to plain numbers expressed in thousands (K).

In [None]:
def _currency_to_thousands(amounts):
    """Convert currency strings such as '€110.5M', '€565K' or '€0' to integer thousands.

    Parameters
    ----------
    amounts : pandas.Series
        Amounts written with an optional '€' sign and an optional 'K'
        (thousands) or 'M' (millions) suffix. Values that are already numeric
        are accepted too, so the cell can be re-run safely.

    Returns
    -------
    pandas.Series
        Integer amounts in thousands, on the same index as `amounts`.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a number, or is missing.
    """
    text = amounts.astype(str).str.replace('€', '', regex=False).str.strip()
    # 'M' amounts are scaled up to thousands; 'K' and unsuffixed amounts are kept as-is.
    scale = text.str[-1:].map({'M': 1e3}).fillna(1.0)
    # Parse the digits directly instead of evaluating strings from the data file as expressions.
    number = pd.to_numeric(text.str.rstrip('KM'))
    # Round before casting: a float product can land just below the intended
    # whole number, and a plain int cast would then truncate it one unit too low.
    return (number * scale).round().astype(int)


for currency_column in ('Wage', 'Value', 'Release Clause'):
    df[currency_column] = _currency_to_thousands(df[currency_column])

The attributes `Height` and `Weight` can be merged into a single attribute called `BMI` (body mass index), after which the original `Height` and `Weight` columns are removed.

In [None]:
def convert_heigt_inches(height_str):
    """Convert a feet'inches height string to a total number of inches.

    Parameters
    ----------
    height_str : str
        Height written as feet and inches separated by an apostrophe,
        e.g. "5'7". Surrounding whitespace and a trailing inch mark
        (as in 6'2") are tolerated.

    Returns
    -------
    int
        The height in inches (feet * 12 + inches).

    Raises
    ------
    IndexError
        If the string contains no apostrophe separator.
    ValueError
        If the feet or inches part is not an integer.
    """
    # Drop an optional trailing inch mark so that 6'2" parses like 6'2.
    parts = height_str.strip().rstrip('"').split("'")
    return int(parts[0]) * 12 + int(parts[1])

# Height in inches and weight in pounds, held as temporary series instead of scratch columns.
height_in = df['Height'].apply(convert_heigt_inches).astype(int)
weight_lb = df['Weight'].replace({'lbs': ''}, regex=True).astype(int)

# BMI from imperial units: weight (lb) / height (in) squared, times 703.
# The new column replaces the raw `Weight` and `Height` columns.
df = df.assign(BMI=(weight_lb / height_in ** 2) * 703).drop(columns=['Weight', 'Height'])
df['BMI'].head()

The `Club` attribute can be converted into numeric scores such as `Club_Value`, `Club_Wage`, `Club_Overall` and `Club_Potential`, which can then be related to players' wages.

In [None]:
def get_club_stat(x, df, name):
    """Return the `name` value of the first row of `df` whose 'Club' equals `x`.

    Raises IndexError when no row of `df` matches `x`.
    """
    is_club = df['Club'] == x
    matching = df.loc[is_club, name]
    return matching.values[0]

# Club-level feature -> (player column it is built from, aggregation applied per club).
club_features = {
    'Club_Value': ('Value', 'sum'),
    'Club_Wage': ('Wage', 'sum'),
    'Club_Overall': ('Overall', 'mean'),
    'Club_Potential': ('Potential', 'mean'),
}

for feature, (source, how) in club_features.items():
    # One aggregated row per club.
    club_df = df.groupby(['Club'])[source].agg(how).reset_index()
    # Broadcast the club aggregate back onto each player through an indexed
    # lookup, rather than re-scanning `club_df` once for every player row.
    df[feature] = df['Club'].map(club_df.set_index('Club')[source])

# One row per club with its aggregated features, on a fresh 0..n-1 index.
# `reset_index(drop=True)` discards the old index whatever it is named.
club_info = (
    df.drop_duplicates('Club')[['Club', *club_features]]
    .reset_index(drop=True)
)

The five richest clubs by total wage:

In [None]:
# The five clubs with the largest summed player wage.
club_info.nlargest(n=5, columns='Club_Wage')

We can look at the correlation matrix to see which features have a high correlation with the wage. These features will be valuable to use for our analysis.

In [None]:
# Absolute pairwise correlations between the numeric columns only. Text columns
# such as `Club` cannot be correlated, and recent pandas versions raise on them
# instead of silently skipping them, so they are filtered out explicitly.
c = df.select_dtypes(include='number').corr().abs()
# The 20 columns most strongly correlated with the wage (including `Wage` itself).
c.nlargest(20, 'Wage')['Wage']

Club-level information such as `Club_Value`, `Club_Wage`, `Club_Overall` and `Club_Potential` will be good to analyse, and some player skills such as `Reaction`, `Composure`, `BallControl`, `ShortPassing` and `Dribbling` are also important features to use.

We also need to be aware that there might be other features that are strongly related to the wage without showing a high positive or negative correlation, such as `Age` and `BMI`.

In [None]:
# Histogram of player ages, binned into at most 30 bins.
age_axis = alt.X("Age", bin=alt.Bin(maxbins=30))
p_age = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=age_axis, y='count()')
    .properties(width=300, height=200, title='Distribution of Age')
)

# Histogram of overall ratings, using Altair's default binning.
overall_axis = alt.X("Overall", bin=True)
p_overall = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=overall_axis, y='count()')
    .properties(width=300, height=200, title='Distribution of overall score')
)

# Show both histograms side by side.
alt.hconcat(p_age, p_overall)

The distribution of `Age` shows that players are generally between 20 and 35 years old. Looking at the overall ratings, only a few players score above 90; the typical overall rating is around 65-70.

We can also look at the relationship between `Age` and `Wage` as well as `Overall` and `Wage`.

In [None]:
# Both scatter plots share the same wage axis.
wage_axis = alt.Y('Wage', title='Wage(in K)')

# Wage against age.
p1 = (
    alt.Chart(df)
    .mark_point()
    .encode(x=alt.X('Age', title='Age of the player'), y=wage_axis)
    .properties(
        width=400,
        height=300,
        title='The relationship between Age and Salary',
    )
)

# Wage against overall rating.
p2 = (
    alt.Chart(df)
    .mark_point()
    .encode(x=alt.X('Overall', title='Overall rating of the player'), y=wage_axis)
    .properties(
        width=400,
        height=300,
        title='The relationship between Overall and Salary',
    )
)

# Show both scatter plots side by side.
alt.hconcat(p1, p2)

The first plot shows that players between the ages of 20 and 30 earn the most, and that wages drop drastically after the age of 35. The second plot suggests that there is a high number of players with an overall rating between 70 and 80, and that players rated higher than 85 have very high salaries.

In [None]:
#alt.Chart(data1).mark_bar().encode(
#    alt.Y("Nationality", sort = '-x'),
#    x='count()',
#    ).properties(
#    title= 'Distribution of Nationality')

Looking at the distribution of the `Wage` will also be important to understand the dataset. 

In [None]:
# Histogram of wages over 80 bins; axis labels and title all use a 16pt font.
plt.hist(df['Wage'], bins=80)
plt.xlabel('Wage(in K)', fontsize=16)
plt.ylabel('Number of players', fontsize=16)
plt.title('The distribution of wage', fontsize=16)

So, we see that the distribution of the salary is highly skewed and that there are some outliers. We can also explore the information about these outliers.

In [None]:
# The eight rows with the highest wage.
df_outliers = df.nlargest(n=8, columns='Wage')
df_outliers

In [None]:
# The five clubs with the largest summed player wage.
club_info.nlargest(n=5, columns='Club_Wage')

In [None]:
some_clubs = ('FC Barcelona', 'Real Madrid', 'Arsenal', 'Manchester City', 'Juventus')
# Keep the players of the selected clubs that have a non-zero wage. The wage
# test must be an explicit comparison: `mask & df['Wage']` is an element-wise
# bitwise AND with the integer wage, so it would filter on the wage's bits
# rather than on the wage being non-zero.
df_club = df.loc[df['Club'].isin(some_clubs) & (df['Wage'] > 0)]

# One box per club on a wide figure, drawn on the axes created here.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
ax = sns.boxplot(x="Club", y="Wage", data=df_club, ax=ax)
ax.set_title(label='Distribution of wage in most richest clubs', fontsize=20);

The `Club` information is one of the most important attributes affecting the `Wage`, however, some clubs like Real Madrid have a high variance on wage. Therefore, we need to take other attributes into consideration.

This is not only true for rich clubs but also for any random subset of clubs. 

In [None]:
# Clubs of five sampled player rows (seeded, so the same clubs are picked on every run).
random_clubs = tuple(df['Club'].sample(n=5, random_state= 20))
# Keep the players of those clubs that have a non-zero wage. The wage test must
# be an explicit comparison: `mask & df['Wage']` is an element-wise bitwise AND
# with the integer wage, so it would filter on the wage's bits rather than on
# the wage being non-zero.
df_club = df.loc[df['Club'].isin(random_clubs) & (df['Wage'] > 0)]

# One box per club on a wide figure, drawn on the axes created here.
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
ax = sns.boxplot(x="Club", y="Wage", data=df_club, ax=ax)
ax.set_title(label='Distribution of wage in  clubs', fontsize=20);