In [120]:
import pandas as pd   # import pandas
import numpy as np    # import numpy

pd.set_option('display.max_columns', None)
file = pd.read_csv('fifa21_train.csv')    # import the document

In [121]:
# rename columns and lower case them to handle them easier

file = file.rename(columns={'Heading Accuracy':'heading_accuracy','Short Passing':'short_passing', 
                           'FK Accuracy':'fk_accuracy', 'Long Passing':'long_passing', 'Ball Control':'ball_control',
                           'Sprint Speed':'sprint_speed', 'Shot Power':'shot_power', 'Long Shots':'long_shots',
                           'Standing Tackle':'Standing_Tackle', 'Sliding Tackle':'Sliding_Tackle',
                           'GK Diving':'GK_Diving', 'GK Handling':'GK_Handling', 'GK Kicking':'GK_Kicking',
                            'GK Positioning':'GK_Positioning', 'GK Reflexes':'GK_Reflexes'})

colu = []
for colname in file.columns:
    colu.append(colname.lower())
file.columns = colu

In [122]:
pd.options.display.max_rows = 5   # To display all the NaN values and deal with them
file.isnull().sum()               # they are 0 now as we already worked with them

id      0
name    0
       ..
gk      0
ova     0
Length: 101, dtype: int64

# Deal with NaN values.

In [123]:
file['position'] = file['position'].fillna(file['bp'])   # position will be filled with the base position

In [124]:
file['composure'] = file['composure'].fillna(np.mean(file['composure']))   # composure is filled with the mean of composure as there were not so many players and we did not want to drop them

In [125]:
# fill the rest of NaN with a simple substraction because we notices the 'totals' include subcategories but we prefer using the subcategories rather than the totals.
file['volleys'] = file['attacking'] - file['crossing'] - file['finishing'] - file['heading_accuracy'] - file['short_passing']
file['curve'] = file['skill'] - file['dribbling'] - file['fk_accuracy'] - file['long_passing'] - file['ball_control']
file['jumping'] = file['power'] - file['shot_power'] - file['stamina'] - file['strength'] - file['long_shots']
file['sliding_tackle'] = file['defending'] - file['marking'] - file['standing_tackle']

In [126]:
# This values are part of a bigger column that will be dropped, but needed to obtain them. 
# We decided to divide the values of 'agility' & 'Balance' as well as 'interceptions', 'vision' &
# 'positioning' equally in their respective 'groups' (as seen in the original DF)
file['agility'] = (file['movement'] - file['acceleration'] - file['sprint_speed'] - file['reactions'])/2
file['balance'] = file['agility']

file['interceptions'] = (file['mentality'] - file['aggression'] - file['penalties'])/3
file['vision'] = file['interceptions']
file['positioning'] = file['interceptions']

In [127]:
# Finally deal with nulls of a/w and d/w, we chose to fill with the mode that also happens to be the Median
print(file['a/w'].mode()) ; print(file['d/w'].mode())
file['a/w'] = file['a/w'].fillna('Medium')
file['d/w'] = file['d/w'].fillna('Medium')

0    Medium
Name: a/w, dtype: object
0    Medium
Name: d/w, dtype: object


In [128]:
# We checked for duplicates with ID and had 0 duplicates
# We also checked for 'name' duplicates and found 469; but after analyzing the data realized they are all different people
print(file.duplicated('id').sum())
print(file.duplicated('name').sum())
file.sort_values("name", inplace = True)
dupli = file[file.duplicated('name',keep=False)==True]   # keep=false - default setting will show the duplicates after the original (so not the original) this makes it show
dupli.tail()

0
469


Unnamed: 0,id,name,age,nationality,club,bp,position,team & contract,height,weight,foot,growth,joined,loan date end,value,wage,release clause,contract,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
7173,243753,W. Coulibaly,28,Ivory Coast,Côte d'Ivoire,LB,LB,Ivory Coast Free,"5'10""",152lbs,Left,0,"Jan 1, 2018",,€0,€0,€0,Ivory Coast Free,253,67,42,50,62,32,263,67,39,41,50,66,351,72,72,72.0,63,72.0,290,35,70,74,69,42,280,64,58.0,58.0,58.0,42,52.0,206,65,71,70,45,12,8,8,6,11,1688,371,3 ★,2★,Medium,Medium,1 ★,72,41,56,67,66,69,25,55+2,55+2,55+2,61+0,58+0,58+0,58+0,61+0,59+2,59+2,59+2,63+2,60+2,60+2,60+2,63+2,68+1,64+2,64+2,64+2,68+1,67+2,66+2,66+2,66+2,67+2,15+2,69
6572,216437,Wallace,25,Brazil,Lazio,CB,CB,Lazio 2016 ~ 2021,"6'3""",187lbs,Right,4,"Jul 28, 2016",,€6.5M,€38K,€11.5M,2016 ~ 2021,225,34,21,75,59,36,209,45,33,29,50,52,286,63,66,43.0,71,43.0,268,40,71,51,80,26,257,79,44.333333,44.333333,44.333333,45,68.0,225,74,77,74,58,15,11,11,10,11,1528,333,2 ★,2★,Medium,Medium,2 ★,65,28,44,48,76,72,25,47+2,47+2,47+2,45+0,45+0,45+0,45+0,45+0,45+2,45+2,45+2,47+2,50+2,50+2,50+2,47+2,62+2,65+2,65+2,65+2,62+2,65+2,74+2,74+2,74+2,65+2,18+2,74
6801,188370,Wallace,31,Brazil,Göztepe SK,CB,CB,Göztepe SK 2018 ~ 2020,"6'1""",168lbs,Right,0,"Jan 31, 2018",,€3M,€19K,€6M,2018 ~ 2020,302,52,52,73,68,57,248,50,25,47,68,58,243,34,48,46.5,68,46.5,307,62,84,40,76,45,299,79,55.666667,55.666667,55.666667,53,66.0,214,70,74,70,47,10,9,7,13,8,1660,346,2 ★,2★,Medium,Medium,1 ★,42,52,59,52,73,68,3,56+2,56+2,56+2,52+0,53+0,53+0,53+0,52+0,54+2,54+2,54+2,53+2,59+2,59+2,59+2,53+2,61+2,68+2,68+2,68+2,61+2,63+2,72+1,72+1,72+1,63+2,16+2,73
797,187896,Wanderson,33,Brazil,Helsingborgs IF,CAM,ST CAM LM,Helsingborgs IF 2018 ~ 2019,"5'11""",165lbs,Right,0,"Aug 7, 2018",,€400K,€2K,€500K,2018 ~ 2019,321,62,65,64,67,63,331,67,67,65,64,68,302,57,55,63.0,64,63.0,310,73,50,60,62,65,280,66,48.666667,48.666667,48.666667,68,69.0,79,45,15,19,56,8,10,9,13,16,1679,346,3 ★,4★,Medium,Medium,2 ★,56,67,65,67,29,62,4,65+1,65+1,65+1,65+0,66+0,66+0,66+0,65+0,65+1,65+1,65+1,64+2,60+2,60+2,60+2,64+2,47+2,48+2,48+2,48+2,47+2,44+2,42+2,42+2,42+2,44+2,17+2,66
737,220171,Wanderson,32,Bulgaria,Bulgaria,CF,LM RM CAM,Bulgaria Free,"5'6""",134lbs,Right,0,"Jul 1, 2014",,€0,€0,€0,Free,301,56,68,42,67,68,329,69,63,66,61,70,375,78,76,77.0,67,77.0,298,71,42,67,51,67,267,41,56.0,56.0,56.0,58,55.0,93,19,40,34,53,11,7,12,10,13,1716,363,3 ★,3★,Medium,Medium,1 ★,77,68,63,71,31,53,3,67+2,67+2,67+2,69+0,70+-1,70+-1,70+-1,69+0,69+0,69+0,69+0,68+1,64+2,64+2,64+2,68+1,53+2,50+2,50+2,50+2,53+2,50+2,41+2,41+2,41+2,50+2,17+2,69


In [129]:
file = file.drop(['id', 'nationality', 'club', 'team & contract',
                 'height', 'weight', 'foot', 'joined', 'loan date end',
                 'value', 'wage', 'release clause', 'contract', 'attacking',
                 'skill', 'movement', 'power', 'mentality', 'defending',
                 'goalkeeping', 'total stats', 'base stats', 'w/f', 'ir' ],axis=1)

In [130]:
pd.options.display.max_rows = 5   # To display all the NaN values and deal with them
file.isnull().sum()

name    0
age     0
       ..
gk      0
ova     0
Length: 77, dtype: int64

In [132]:
file['hits']=file['hits'].str.replace('K','000')
file['hits'] = pd.to_numeric(file['hits'], errors='coerce')

In [136]:
X = file.drop(['ova'],axis=1)   # OVA is our target
y = file['ova']

X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)

In [139]:
from sklearn.preprocessing import MinMaxScaler 

transformer = MinMaxScaler().fit(X_num)
x_normalized = transformer.transform(X_num)
print(x_normalized.shape)
data_normalized = pd.DataFrame(x_normalized, columns = X_num.columns)
pd.DataFrame(x_normalized, columns=X_num.columns)

print(type(data_normalized))

(11701, 43)
<class 'pandas.core.frame.DataFrame'>


In [140]:
X_cat1=X_cat.drop(columns=['position', 'ls', 'st', 'rs', 'lw', 'lf',
                           'cf', 'rf', 'rw' ,'lam', 'cam','ram', 'lm',
                            'lcm', 'cm','rcm', 'rm', 'lwb', 'ldm' , 'cdm',
                            'rdm' , 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
                            'gk'],axis=1)

In [141]:
X_num.corr()

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits
age,1.000000,-0.854136,0.159280,0.110223,0.178446,0.185496,0.179340,0.044140,0.173597,0.231401,0.230267,0.129641,-0.183392,-0.175948,-0.077099,0.504708,-0.077099,0.307396,0.168928,0.069533,0.350674,0.198765,0.279280,0.242512,0.242512,0.242512,0.201801,0.413964,0.158495,0.109870,0.079784,0.116316,0.119441,0.125299,0.129068,0.116089,-0.176232,0.311175,0.402361,0.242675,0.225285,0.434448,-0.098374
growth,-0.854136,1.000000,-0.252015,-0.189302,-0.228983,-0.271381,-0.251627,-0.153463,-0.253594,-0.275789,-0.294287,-0.224358,0.026837,0.013074,-0.039872,-0.562717,-0.039872,-0.368967,-0.221050,-0.226230,-0.371032,-0.277130,-0.317922,-0.324221,-0.324221,-0.324221,-0.231044,-0.465516,-0.199809,-0.145174,-0.120153,-0.049520,-0.047730,-0.047191,-0.055057,-0.045935,0.019677,-0.334891,-0.439781,-0.320569,-0.240244,-0.484123,0.078442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
phy,0.434448,-0.484123,-0.000844,-0.061217,0.401288,0.167888,-0.008380,-0.047277,-0.024401,-0.011374,0.187789,0.068816,-0.159813,-0.079448,-0.253960,0.494057,-0.253960,0.210815,0.425402,0.393147,0.847653,0.040817,0.593860,0.241421,0.241421,0.241421,-0.003380,0.353798,0.404056,0.381677,0.355478,0.059076,0.059086,0.057732,0.072035,0.058348,-0.121209,0.057520,0.177747,0.026681,0.507137,1.000000,0.090232
hits,-0.098374,0.078442,0.140628,0.134774,0.080051,0.183163,0.135353,0.174074,0.161284,0.110648,0.148807,0.181552,0.146796,0.146090,0.126427,0.269773,0.126427,0.164423,0.058440,0.122872,0.029685,0.134997,0.085496,0.167126,0.167126,0.167126,0.093043,0.231245,0.064659,0.055750,0.047057,-0.029442,-0.029194,-0.029728,-0.030038,-0.030115,0.170643,0.150437,0.213219,0.249806,0.059674,0.090232,1.000000


In [142]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [143]:
# OneHotEncoder
X_cat_encoded = X_cat1['bp']
X_cat_encoded = pd.DataFrame(X_cat_encoded, columns=['bp'])

encoder = OneHotEncoder(drop='first').fit(X_cat_encoded)
encoded = encoder.transform(X_cat_encoded).toarray()
encoded
cols = encoder.get_feature_names(input_features=X_cat_encoded.columns)
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.head()



Unnamed: 0,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
# Label Encoder for the values: A/W, D/W, SM

# we want to know what are the values given
print(X_cat1['a/w'].unique())
print(X_cat1['d/w'].unique())
print(X_cat1['sm'].unique())

['Medium' 'Low' 'High']
['Medium' 'High' 'Low']
['2★' '1★' '3★' '4★' '5★']


In [145]:
# Create a list with the values for the label encoder

# just for a/w
X_cat_aw = X_cat1['a/w']
X_cat_aw = pd.DataFrame(X_cat_aw, columns=['a/w'])

# just for d/w
X_cat_dw = X_cat1['d/w']
X_cat_dw = pd.DataFrame(X_cat_dw, columns=['d/w'])

# # just for sm
X_cat_sm = X_cat1['sm']
X_cat_sm = pd.DataFrame(X_cat_sm, columns=['sm'])

# just for a/w
label_encoded_aw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_aw) # give a value to each type
label_encoded_aw = pd.DataFrame(label_encoded_aw,columns=X_cat_aw.columns)
display(label_encoded_aw.head())

# just for d/w
label_encoded_dw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_dw) # give a value to each type
label_encoded_dw = pd.DataFrame(label_encoded_dw,columns=X_cat_dw.columns)
display(label_encoded_dw.head())

# just for sm
label_encoded_sm = LabelEncoder().fit(['1★','2★', '3★', '4★','5★']).transform(X_cat_sm) # give a value to each type
label_encoded_sm = pd.DataFrame(label_encoded_sm,columns=X_cat_sm.columns)
display(label_encoded_sm.head())

  y = column_or_1d(y, warn=True)


Unnamed: 0,a/w
0,2
1,1
2,0
3,2
4,2


  y = column_or_1d(y, warn=True)


Unnamed: 0,d/w
0,2
1,2
2,2
3,2
4,0


  y = column_or_1d(y, warn=True)


Unnamed: 0,sm
0,1
1,1
2,1
3,0
4,1


In [146]:
# Concat DataFrames

X = pd.concat([data_normalized,label_encoded_aw, label_encoded_dw, label_encoded_sm, onehot_encoded], axis=1)
X

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits,a/w,d/w,sm,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,0.407407,0.074074,0.579545,0.239130,0.602273,0.534884,0.355556,0.461538,0.414894,0.337079,0.440476,0.549451,0.630952,0.623529,0.617801,0.458333,0.617801,0.397590,0.905263,0.552941,0.610390,0.222222,0.609195,0.611111,0.611111,0.611111,0.430233,0.488095,0.674157,0.714286,0.722222,0.157303,0.093023,0.139535,0.120879,0.146067,0.557143,0.184211,0.382353,0.358209,0.662338,0.575758,0.00138,2,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.481481,0.037037,0.568182,0.489130,0.784091,0.581395,0.355556,0.472527,0.510638,0.382022,0.630952,0.505495,0.464286,0.458824,0.565445,0.541667,0.565445,0.746988,0.810526,0.576471,0.818182,0.622222,0.758621,0.662698,0.662698,0.662698,0.500000,0.630952,0.674157,0.750000,0.788889,0.123596,0.069767,0.058140,0.065934,0.157303,0.357143,0.500000,0.455882,0.343284,0.727273,0.742424,0.00178,1,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11699,0.592593,0.037037,0.647727,0.423913,0.579545,0.709302,0.633333,0.681319,0.404255,0.730337,0.666667,0.692308,0.535714,0.447059,0.722513,0.611111,0.722513,0.771084,0.747368,0.541176,0.532468,0.666667,0.793103,0.726190,0.726190,0.726190,0.569767,0.654762,0.752809,0.714286,0.788889,0.101124,0.069767,0.093023,0.131868,0.112360,0.385714,0.500000,0.588235,0.582090,0.727273,0.560606,0.00078,2,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11700,0.148148,0.407407,0.511364,0.347826,0.670455,0.627907,0.300000,0.549451,0.319149,0.325843,0.547619,0.571429,0.738095,0.764706,0.685864,0.486111,0.685864,0.626506,0.715789,0.752941,0.649351,0.577778,0.689655,0.519841,0.519841,0.519841,0.325581,0.595238,0.629213,0.714286,0.711111,0.134831,0.093023,0.069767,0.043956,0.134831,0.714286,0.368421,0.367647,0.447761,0.662338,0.666667,0.00058,0,0,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [147]:
# Apply linear regression & Model Validation
# We defined a function to do all the next steps together:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math

In [148]:
def original():      
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    print('X_train.shape: ', X_train.shape)
    print('X_test.shape: ',X_test.shape)
    print('y_train.shape: ',y_train.shape)
    print('y_test.shape: ',y_test.shape)
    lm = linear_model.LinearRegression()
    lm.fit(X_train,y_train)
    predictions = lm.predict(X_train)
    print('r2_score,y_train, predictions: ',r2_score(y_train, predictions)
#     predictions_test = lm.predict(X_test)
#     print('y train prediction: ',r2_score(y_test, predictions_test))
#     print('y test prediction: ',r2_score(y_test, predictions_test))
#     mse=mean_squared_error(y_test,predictions_test)
#     print('mse: ',mse)
#     rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
#     print('rmse: ',rmse1)
#     mae = mean_absolute_error(y_test, predictions_test)
#     print('mae: ',mae)
#     print('y test mean: ',y_test.mean())
original()

SyntaxError: invalid syntax (945566221.py, line 21)

In [149]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('X_train.shape: ', X_train.shape)
print('X_test.shape: ',X_test.shape)
print('y_train.shape: ',y_train.shape)
print('y_test.shape: ',y_test.shape)
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)
predictions = lm.predict(X_train)
print('r2_score,y_train, predictions: ',r2_score(y_train, predictions))
predictions_test = lm.predict(X_test)
print('y train prediction: ',r2_score(y_test, predictions_test))
print('y test prediction: ',r2_score(y_test, predictions_test))
mse=mean_squared_error(y_test,predictions_test)
print('mse: ',mse)
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
print('rmse: ',rmse)
mae = mean_absolute_error(y_test, predictions_test)
print('mae: ',mae)
print('y test mean: ',y_test.mean())

X_train.shape:  (9360, 60)
X_test.shape:  (2341, 60)
y_train.shape:  (9360,)
y_test.shape:  (2341,)
r2_score,y_train, predictions:  0.912187184867781
y train prediction:  0.9071431421672848
y test prediction:  0.9071431421672848
mse:  4.283658110907287
rmse:  2.069700005050801
mae:  1.6029695917800186
y test mean:  66.82058949167023
