In [321]:
# Predicting soccer player's value based on Fifa 19 data.

In [463]:
# Convert CSV to Pandas and examine some entries

import os
import pandas as pd

def loadData():
    soccerPath = os.path.join("datasets", "soccer")
    csv_path = os.path.join(soccerPath, "fifa.csv")
    return pd.read_csv(csv_path, encoding = "ISO-8859-1")

data = loadData() #Our Pandas DataFrame
data.head()       #Displays the first 5 rows of our DataFrame

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,â¬226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,â¬127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,â¬228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,â¬138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,â¬196.4M


In [464]:
# We are given the overall rating of the player, therefore we don't need indivdual attributes that are used in
# the calculation and thus dropping columns 55 to 88. We also don't need their overalls in different positions 
# such as Messi's overall as a LB since we assume they are playing in their natural position and thus dropping 
# columns 29 to 54 as well.
#
# Photo, flag and club logo columns contain links to images, these columns (5,7,11) are not needed either.
#
# Removing other irrelevant information. Years left in contract, height, weight, body type etc.

s1 = data.columns.to_series()[28:88]
s2 = data.columns.to_series()[[4,6,10]]
s3 = data.columns.to_series()[[0,1,5,9,13,14,18,19,20,22,23,24,25]]
s3 = pd.concat([s1,s2,s3])
data.drop(s3, axis=1, inplace=True)
data = data.drop(data[data.Value == 0].index)
data = data.drop(data[data.Wage == 0].index)
data = data.drop("Height", axis = 1)
data = data.drop("Weight", axis = 1)
data = data.drop("Position", axis = 1)

data.head() 

Unnamed: 0,Name,Age,Overall,Potential,Value,Wage,International Reputation,Weak Foot,Skill Moves,Release Clause
0,L. Messi,31,94,94,â¬110.5M,â¬565K,5.0,4.0,4.0,â¬226.5M
1,Cristiano Ronaldo,33,94,94,â¬77M,â¬405K,5.0,4.0,5.0,â¬127.1M
2,Neymar Jr,26,92,93,â¬118.5M,â¬290K,5.0,5.0,5.0,â¬228.1M
3,De Gea,27,91,93,â¬72M,â¬260K,4.0,3.0,1.0,â¬138.6M
4,K. De Bruyne,27,91,92,â¬102M,â¬355K,4.0,5.0,4.0,â¬196.4M


In [465]:
#Converting monetary features to strictly numerical values

def toInt(val):
    val = str(val)
    if val[-1:] == "M":
        return (float(val[3:-1]))
    elif val[-1:] == "K":
        return (float(val[3:-1]) / 1000)
    else:
        return 0
    
data['Value'] = data['Value'].apply(toInt)
data['Wage'] = data['Wage'].apply(toInt)
data['Release Clause'] = data['Release Clause'].apply(toInt)

In [466]:
#Creating a new feature, Potential Left. Which describes how much growth a player has left in their career.
# Displying correlation matrix for Value
data["Potential_Left"] = data["Potential"] - data["Overall"]
corr_matrix = data.corr()
corr_matrix["Value"].sort_values(ascending=False)

Value                       1.000000
Release Clause              0.967255
Wage                        0.858442
International Reputation    0.653176
Overall                     0.627085
Potential                   0.576218
Skill Moves                 0.315651
Weak Foot                   0.165221
Age                         0.075022
Potential_Left             -0.147035
Name: Value, dtype: float64

In [467]:
# Some entries have null values for certain features, dropping these all together

data = data[np.isfinite(data['Skill Moves'])]
data = data[np.isfinite(data['International Reputation'])]
data = data[np.isfinite(data['Weak Foot'])]

In [468]:
#Splitting into stratified test and train sets with a test size of 25%

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(data, test_size=0.25, random_state=42)

train_labels = train_set["Value"].copy() 
test_labels = test_set["Value"].copy() 
train_set = train_set.drop("Value", axis = 1)
test_set = test_set.drop("Value", axis = 1)
train_set = train_set.drop("Name", axis = 1)
test_set = test_set.drop("Name", axis = 1)

In [469]:
num_attribs # These are our features the model will make use of

['Age',
 'Overall',
 'Potential',
 'Wage',
 'International Reputation',
 'Weak Foot',
 'Skill Moves',
 'Release Clause',
 'Potential_Left']

In [470]:
# Creating a pipeline to transform our data. Includes standard scaler to scale features.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_attribs = list(train_set)

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
    ])

train_prepared = full_pipeline.fit_transform(train_set)
test_prepared = full_pipeline.fit_transform(test_set)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [471]:
train_prepared

array([[ 0.40306649,  0.97325407,  0.27047993, ...,  0.83441355,
         0.19621815, -0.93469614],
       [-0.88204706, -1.04706668, -0.70326211, ..., -1.79580937,
        -0.37052274,  0.5379585 ],
       [-0.23949029, -1.04706668, -0.86555245, ..., -0.48069791,
        -0.36646245,  0.35387667],
       ...,
       [-0.88204706,  0.5403282 ,  0.75735094, ...,  0.83441355,
         0.03569507,  0.16979484],
       [ 1.04562327,  1.6947972 ,  1.08193162, ...,  2.149525  ,
         1.4898453 , -0.93469614],
       [-1.52460384, -1.19137531,  0.75735094, ..., -0.48069791,
        -0.3390791 ,  2.37877681]])

In [472]:
train_labels

2694      4.200
15486     0.170
15606     0.220
4610      2.400
11626     0.575
14148     0.500
12783     0.575
15405     0.220
8759      1.000
15425     0.230
16448     0.140
16501     0.170
13637     0.325
7172      0.950
7341      0.800
2213      8.500
11509     0.270
11437     0.425
11575     0.475
5290      1.600
9378      0.750
2348      6.000
9039      0.425
11038     0.675
10212     0.550
13130     0.550
11856     0.475
9022      1.100
5431      1.800
17541     0.120
          ...  
3890      3.400
3556      3.900
11394     0.625
17960     0.040
1267      8.500
1899      7.500
3005      5.500
189      24.000
2747      5.500
8666      0.500
6396      1.300
17616     0.090
6420      1.400
5051      0.000
5311      0.875
2433      5.000
769      10.500
1685      6.000
8322      0.725
16071     0.030
11363     0.600
14471     0.325
4426      2.800
16898     0.120
6265      1.200
11284     0.575
11964     0.675
5390      2.400
860       9.500
15843     0.260
Name: Value, Length: 136

In [473]:
#Creating Model

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_prepared, train_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [474]:
# Testing and displaying mean absolute error in prediction
from sklearn.metrics import mean_absolute_error

predictions = lin_reg.predict(test_prepared)
lin_mae = mean_absolute_error(test_labels, predictions)
lin_mae 

0.49316434282658916

In [475]:
#Lets see what it predicts Messi's value to be
predictions[2313]

112.48540849407293

In [476]:
# Messi's actual value (millions of euros)
test_labels.iloc[2313]

110.5