

# FIFA - Predict Player Position 





## Data manipulation

First let's import some packages:

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

%matplotlib inline 

Load the data:

In [34]:
df = pd.read_csv('./CompleteDataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,RB,RCB,RCM,RDM,RF,RM,RS,RW,RWB,ST
0,0,Cristiano Ronaldo,32,https://cdn.sofifa.org/48/18/players/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Real Madrid CF,https://cdn.sofifa.org/24/18/teams/243.png,...,61,53,82,62,91,89,92,91,66,92
1,1,L. Messi,30,https://cdn.sofifa.org/48/18/players/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,93,93,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,...,57,45,84,59,92,90,88,91,62,88
2,2,Neymar,25,https://cdn.sofifa.org/48/18/players/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,94,Paris Saint-Germain,https://cdn.sofifa.org/24/18/teams/73.png,...,59,46,79,59,88,87,84,89,64,84
3,3,L. Suárez,30,https://cdn.sofifa.org/48/18/players/176580.png,Uruguay,https://cdn.sofifa.org/flags/60.png,92,92,FC Barcelona,https://cdn.sofifa.org/24/18/teams/241.png,...,64,58,80,65,88,85,88,87,68,88
4,4,R. Lewandowski,28,https://cdn.sofifa.org/48/18/players/188545.png,Poland,https://cdn.sofifa.org/flags/37.png,91,91,FC Bayern Munich,https://cdn.sofifa.org/24/18/teams/21.png,...,58,57,78,62,87,82,88,84,61,88


Check out columns:

In [35]:
df.columns

Index(['Unnamed: 0', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control',
       'Composure', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free kick accuracy', 'GK diving', 'GK handling', 'GK kicking',
       'GK positioning', 'GK reflexes', 'Heading accuracy', 'Interceptions',
       'Jumping', 'Long passing', 'Long shots', 'Marking', 'Penalties',
       'Positioning', 'Reactions', 'Short passing', 'Shot power',
       'Sliding tackle', 'Sprint speed', 'Stamina', 'Standing tackle',
       'Strength', 'Vision', 'Volleys', 'CAM', 'CB', 'CDM', 'CF', 'CM', 'ID',
       'LAM', 'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB',
       'Preferred Positions', 'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM',
       'RS', 'RW', 'RWB', 'ST'],
      dtype='object')

Gather only columns that we need for this analysis purpose:

In [36]:
# GK attributes are not our interest
columns_needed = ['Aggression', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free kick accuracy', 'Heading accuracy', 'Long shots', 'Penalties',
       'Shot power', 'Volleys', 'Short passing', 'Long passing',
       'Interceptions', 'Marking', 'Sliding tackle', 'Acceleration', 'Agility', 'Reactions','Sprint speed',
       'Positioning', 'Preferred Positions']

# attack attribute first, then defence, then mixed
columns_needed_rearranged = ['Aggression', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free kick accuracy', 'Heading accuracy', 'Long shots', 'Penalties',
       'Shot power', 'Volleys', 'Short passing', 'Long passing',
       'Interceptions', 'Marking', 'Sliding tackle', 'Acceleration', 'Agility', 'Reactions','Sprint speed',
       'Positioning','Preferred Positions']

df = df[columns_needed_rearranged]
df.head()

Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,63,85,81,91,94,76,88,92,85,94,...,77,29,22,23,89,89,96,91,95,ST LW
1,48,77,89,97,95,90,71,88,74,85,...,87,22,13,26,92,90,95,87,93,RW
2,56,75,81,96,89,84,62,77,81,80,...,75,36,21,33,94,96,88,90,90,LW
3,78,77,86,86,94,84,77,86,85,87,...,64,41,30,38,88,86,93,77,92,ST
4,80,62,77,85,91,84,85,83,81,88,...,65,39,25,19,79,78,91,83,91,ST


We don't want to classify GK because it will be too obvious:

In [37]:
df['Preferred Positions'] = df['Preferred Positions'].str.strip()
df = df[df['Preferred Positions'] != 'GK']
df.head()



Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,63,85,81,91,94,76,88,92,85,94,...,77,29,22,23,89,89,96,91,95,ST LW
1,48,77,89,97,95,90,71,88,74,85,...,87,22,13,26,92,90,95,87,93,RW
2,56,75,81,96,89,84,62,77,81,80,...,75,36,21,33,94,96,88,90,90,LW
3,78,77,86,86,94,84,77,86,85,87,...,64,41,30,38,88,86,93,77,92,ST
4,80,62,77,85,91,84,85,83,81,88,...,65,39,25,19,79,78,91,83,91,ST


Check any missing data:

In [38]:
df.isnull().values.any()

False

All possible outcome for preferred position:

In [39]:
p = df['Preferred Positions'].str.split().apply(lambda x: x[0]).unique()
p

array(['ST', 'RW', 'LW', 'CDM', 'CB', 'RM', 'CM', 'LM', 'LB', 'CAM', 'RB',
       'CF', 'RWB', 'LWB'], dtype=object)

Handle players with multiple preferred positions: duplicate a set of data for each

In [40]:
# copy a structure
df_new = df.copy()
df_new.drop(df_new.index, inplace=True)

for i in p:
    df_temp = df[df['Preferred Positions'].str.contains(i)]
    df_temp['Preferred Positions'] = i
    df_new = df_new.append(df_temp, ignore_index=True)
    
df_new.iloc[::2000, :]
            



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['Preferred Positions'] = i
  df_new = df_new.append(df_temp, ignore_index=True)


Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,63,85,81,91,94,76,88,92,85,94,...,77,29,22,23,89,89,96,91,95,ST
2000,34,48,55,68,64,50,52,57,52,61,...,48,14,17,16,83,79,58,87,65,ST
4000,51,33,32,56,59,30,59,57,61,68,...,52,29,12,19,64,61,59,73,61,RW
6000,67,68,75,65,51,75,43,75,84,76,...,72,66,65,57,42,63,75,34,65,CDM
8000,78,43,48,35,22,58,75,45,49,45,...,65,75,77,75,55,62,70,60,21,CB
10000,71,44,48,53,35,35,50,49,42,63,...,58,50,60,61,65,64,58,72,32,CB
12000,76,76,75,68,60,78,36,73,61,74,...,72,52,41,45,76,83,69,75,66,RM
14000,77,53,48,64,65,46,69,64,50,77,...,64,78,75,76,58,52,72,62,59,CM
16000,80,58,41,60,56,57,60,59,58,62,...,61,65,61,64,72,73,63,78,56,CM
18000,49,67,70,79,66,64,35,71,55,74,...,62,30,35,39,79,80,59,78,63,LM


Some of the attributes have '+/-' sign, let's perform the calculation rather than keeping them as string:

In [41]:
cols = [col for col in df_new.columns if col not in ['Preferred Positions']]

for i in cols:
    df_new[i] = df_new[i].apply(lambda x: eval(x) if isinstance(x,str) else x)

df_new.iloc[::1000, :]

Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,63,85,81,91,94,76,88,92,85,94,...,77,29,22,23,89,89,96,91,95,ST
1000,71,32,54,61,74,54,75,59,75,79,...,72,31,21,21,51,46,68,55,72,ST
2000,34,48,55,68,64,50,52,57,52,61,...,48,14,17,16,83,79,58,87,65,ST
3000,34,42,41,51,55,42,55,47,60,48,...,38,22,14,20,68,59,58,63,57,ST
4000,51,33,32,56,59,30,59,57,61,68,...,52,29,12,19,64,61,59,73,61,RW
5000,30,61,60,67,51,46,45,44,61,61,...,59,24,28,32,65,65,36,69,57,LW
6000,67,68,75,65,51,75,43,75,84,76,...,72,66,65,57,42,63,75,34,65,CDM
7000,69,66,68,60,52,67,57,67,67,62,...,68,61,57,36,64,63,64,56,60,CDM
8000,78,43,48,35,22,58,75,45,49,45,...,65,75,77,75,55,62,70,60,21,CB
9000,83,69,58,56,52,59,75,51,54,65,...,67,50,64,67,62,60,72,65,59,CB


## Predict binary targets (attack vs defend positions) with logistic regression

The pattern after normalization looks much more obvious. Lets do below:

* Normalize the whole dataset


* Reclassify the target value (preferred positions) to binary groups as below:
- 1 =  attack positions = ST, RW, LW, RM, CM, LM, CAM, CF 
- 0 = defened positions = CDM, CB, LB, RB, RWB, LWB



In [42]:
df_new.columns

Index(['Aggression', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free kick accuracy', 'Heading accuracy', 'Long shots', 'Penalties',
       'Shot power', 'Volleys', 'Short passing', 'Long passing',
       'Interceptions', 'Marking', 'Sliding tackle', 'Acceleration', 'Agility',
       'Reactions', 'Sprint speed', 'Positioning', 'Preferred Positions'],
      dtype='object')

In [43]:
df_new

Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,63,85,81,91,94,76,88,92,85,94,...,77,29,22,23,89,89,96,91,95,ST
1,78,77,86,86,94,84,77,86,85,87,...,64,41,30,38,88,86,93,77,92,ST
2,80,62,77,85,91,84,85,83,81,88,...,65,39,25,19,79,78,91,83,91,ST
3,50,68,74,84,91,62,86,82,70,88,...,59,20,12,18,78,75,88,80,92,ST
4,80,80,78,90,85,78,70,82,77,84,...,73,42,30,35,88,90,87,84,86,ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26872,53,59,45,47,34,29,37,26,35,28,...,42,45,52,54,68,58,53,58,39,LWB
26873,55,39,27,55,22,29,46,20,72,66,...,31,47,55,56,61,60,53,62,44,LWB
26874,60,53,36,59,35,36,51,33,47,27,...,30,56,55,57,63,58,58,36,51,LWB
26875,35,42,40,53,38,32,38,33,46,53,...,41,21,29,36,72,57,44,76,39,LWB


In [44]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_new_normalized = df_new.copy()
df_new_normalized.iloc[:,:-1] = sc.fit_transform(df_new_normalized.iloc[:,:-1])
df_new_normalized

Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,0.254667,2.123072,1.844637,2.392691,2.734360,1.809867,2.792342,2.513553,2.560085,2.559310,...,1.627410,-1.155090,-1.359375,-1.390702,1.716498,1.751052,3.757887,1.928843,2.805490,ST
1,1.287037,1.524866,2.180027,1.978141,2.734360,2.340653,1.824316,2.121844,2.560085,2.016999,...,0.547792,-0.504833,-0.937779,-0.598291,1.630336,1.501791,3.419027,0.699630,2.588743,ST
2,1.424686,0.403228,1.576325,1.895232,2.543128,2.340653,2.528335,1.925989,2.235123,2.094472,...,0.630839,-0.613209,-1.201277,-1.602012,0.854879,0.837095,3.193121,1.226436,2.516494,ST
3,-0.640054,0.851883,1.375091,1.812322,2.543128,0.880991,2.616338,1.860704,1.341478,2.094472,...,0.132554,-1.642784,-1.886370,-1.654840,0.768718,0.587835,2.854262,0.963033,2.588743,ST
4,1.424686,1.749193,1.643403,2.309781,2.160666,1.942564,1.208299,1.860704,1.910161,1.784581,...,1.295219,-0.450644,-0.937779,-0.756773,1.630336,1.834139,2.741309,1.314236,2.155249,ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26872,-0.433580,0.178901,-0.570172,-1.255342,-1.090269,-1.308504,-1.695781,-1.795252,-1.501937,-2.553902,...,-1.279254,-0.288080,0.221610,0.246949,-0.092900,-0.824643,-1.099096,-0.968586,-1.240453,LWB
26873,-0.295930,-1.316616,-1.777576,-0.592063,-1.855195,-1.308504,-0.903759,-2.186961,1.503959,0.390068,...,-2.192777,-0.179704,0.379708,0.352604,-0.696033,-0.658469,-1.099096,-0.617383,-0.879208,LWB
26874,0.048193,-0.269754,-1.173874,-0.260424,-1.026525,-0.844065,-0.463747,-1.338257,-0.527052,-2.631375,...,-2.275824,0.307990,0.379708,0.405431,-0.523709,-0.824643,-0.534330,-2.900206,-0.373465,LWB
26875,-1.672424,-1.092289,-0.905562,-0.757883,-0.835294,-1.109459,-1.607779,-1.338257,-0.608292,-0.617079,...,-1.362301,-1.588596,-0.990479,-0.703945,0.251747,-0.907730,-2.115673,0.611829,-1.240453,LWB


In [45]:
mapping = {'ST': 1, 'RW': 1, 'LW': 1, 'RM': 1, 'CM': 1, 'LM': 1, 'CAM': 1, 'CF': 1, 'CDM': 0, 'CB': 0, 'LB': 0, 'RB': 0, 'RWB': 0, 'LWB': 0}
df_new_normalized['Preferred Positions'] = df_new['Preferred Positions']
df_new_normalized = df_new_normalized.replace({'Preferred Positions': mapping})

df_new_normalized.iloc[::2000,]


Unnamed: 0,Aggression,Crossing,Curve,Dribbling,Finishing,Free kick accuracy,Heading accuracy,Long shots,Penalties,Shot power,...,Long passing,Interceptions,Marking,Sliding tackle,Acceleration,Agility,Reactions,Sprint speed,Positioning,Preferred Positions
0,0.254667,2.123072,1.844637,2.392691,2.73436,1.809867,2.792342,2.513553,2.560085,2.55931,...,1.62741,-1.15509,-1.359375,-1.390702,1.716498,1.751052,3.757887,1.928843,2.80549,1
2000,-1.741249,-0.643634,0.100608,0.485765,0.822045,0.084811,-0.375745,0.228581,-0.12085,0.002704,...,-0.780969,-1.967913,-1.622873,-1.760494,1.199527,0.920182,-0.53433,1.577639,0.638021,1
4000,-0.571229,-1.765271,-1.442186,-0.509153,0.503326,-1.242155,0.240272,0.228581,0.610314,0.545014,...,-0.448779,-1.15509,-1.88637,-1.602012,-0.437548,-0.575383,-0.421377,0.348427,0.349025,1
6000,0.529966,0.851883,1.442169,0.237035,-0.006624,1.743519,-1.167767,1.40371,2.478845,1.164797,...,1.212172,0.849871,0.906703,0.405431,-2.333107,-0.409209,1.385872,-3.075807,0.638021,0
8000,1.287037,-1.017513,-0.368938,-2.25026,-1.855195,0.615597,1.648311,-0.554838,-0.364571,-1.236863,...,0.630839,1.337564,1.539097,1.356325,-1.213004,-0.492296,0.821107,-0.792985,-2.540934,0
10000,0.805264,-0.942737,-0.368938,-0.757883,-1.026525,-0.910414,-0.55175,-0.293698,-0.933254,0.15765,...,0.049507,-0.017139,0.643206,0.616741,-0.351386,-0.326122,-0.53433,0.260626,-1.746196,0
12000,1.149388,1.45009,1.442169,0.485765,0.56707,1.942564,-1.783784,1.27314,0.610314,1.009852,...,1.212172,0.091237,-0.358085,-0.228498,0.596394,1.25253,0.708154,0.524029,0.710269,1
14000,1.218212,-0.269754,-0.368938,0.154125,0.885789,-0.180582,1.120296,0.685575,-0.28333,1.24227,...,0.547792,1.500129,1.433698,1.409153,-0.954518,-1.323165,1.047013,-0.617383,0.204527,1
16000,1.424686,0.104125,-0.838484,-0.177514,0.312095,0.549249,0.328275,0.359151,0.366593,0.080177,...,0.298649,0.795683,0.695905,0.775223,0.251747,0.421661,0.030435,0.787431,-0.01222,1
18000,-0.708878,0.777107,1.106779,1.397773,0.949533,1.013687,-1.871786,1.14257,0.122872,1.009852,...,0.381697,-1.100902,-0.674282,-0.545463,0.854879,1.003269,-0.421377,0.787431,0.493523,1


In [46]:
import pickle
pickle.dump(sc,open("scaler.pkl","wb"))

Split train test dataset:

In [47]:
X_train, X_test, y_train, y_test = train_test_split(df_new_normalized.iloc[:,:-1], df_new_normalized.iloc[:,-1], random_state=0)

print('X train shape: {}'.format(X_train.shape))
print('X test shape: {}'.format(X_test.shape))
print('y train shape: {}'.format(y_train.shape))
print('y test shape: {}'.format(y_test.shape))


X train shape: (20157, 21)
X test shape: (6720, 21)
y train shape: (20157,)
y test shape: (6720,)


Apply logistic regression to training set:

In [48]:
clf_d = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
acc_d = clf_d.score(X_test, y_test)
print ('Dummy Classifier (most frequent class): {}'.format(acc_d))

clf = LogisticRegression().fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print ('Logistic Regression Accuracy: {}'.format(acc))


Dummy Classifier (most frequent class): 0.6086309523809523
Logistic Regression Accuracy: 0.8632440476190476


In [49]:
import pickle
pickle.dump(clf,open("model.pkl","wb"))