In [141]:
import pandas as pd   # import pandas
import numpy as np    # import numpy

pd.set_option('display.max_columns', None)
file = pd.read_csv('fifa21_train.csv')    # import the document

In [142]:
# rename columns and lower case them to handle them easier

file = file.rename(columns={'Heading Accuracy':'heading_accuracy','Short Passing':'short_passing', 
                           'FK Accuracy':'fk_accuracy', 'Long Passing':'long_passing', 'Ball Control':'ball_control',
                           'Sprint Speed':'sprint_speed', 'Shot Power':'shot_power', 'Long Shots':'long_shots',
                           'Standing Tackle':'Standing_Tackle', 'Sliding Tackle':'Sliding_Tackle',
                           'GK Diving':'GK_Diving', 'GK Handling':'GK_Handling', 'GK Kicking':'GK_Kicking',
                            'GK Positioning':'GK_Positioning', 'GK Reflexes':'GK_Reflexes'})

colu = []
for colname in file.columns:
    colu.append(colname.lower())
file.columns = colu

In [143]:
pd.options.display.max_rows = 5   # To display all the NaN values and deal with them
file.isnull().sum().sum()         # they are 0 now as we already worked with them (.sum().sum() gives us the total sum)

12117

# Deal with NaN values.

In [144]:
file['position'] = file['position'].fillna(file['bp'])   # position will be filled with the base position

In [145]:
file['composure'] = file['composure'].fillna(np.mean(file['composure']))   # composure is filled with the mean of composure as there were not so many players and we did not want to drop them

In [146]:
# fill the rest of NaN with a simple substraction because we notices the 'totals' include subcategories but we prefer using the subcategories rather than the totals.
file['volleys'] = file['attacking'] - file['crossing'] - file['finishing'] - file['heading_accuracy'] - file['short_passing']
file['curve'] = file['skill'] - file['dribbling'] - file['fk_accuracy'] - file['long_passing'] - file['ball_control']
file['jumping'] = file['power'] - file['shot_power'] - file['stamina'] - file['strength'] - file['long_shots']
file['sliding_tackle'] = file['defending'] - file['marking'] - file['standing_tackle']

In [147]:
# This values are part of a bigger column that will be dropped, but needed to obtain them. 
# We decided to divide the values of 'agility' & 'Balance' as well as 'interceptions', 'vision' &
# 'positioning' equally in their respective 'groups' (as seen in the original DF)
file['agility'] = (file['movement'] - file['acceleration'] - file['sprint_speed'] - file['reactions'])/2
file['balance'] = file['agility']

file['interceptions'] = (file['mentality'] - file['aggression'] - file['penalties'])/3
file['vision'] = file['interceptions']
file['positioning'] = file['interceptions']

In [148]:
# Finally deal with nulls of a/w and d/w, we chose to fill with the mode that also happens to be the Median
print(file['a/w'].mode()) ; print(file['d/w'].mode())
file['a/w'] = file['a/w'].fillna('Medium')
file['d/w'] = file['d/w'].fillna('Medium')

0    Medium
Name: a/w, dtype: object
0    Medium
Name: d/w, dtype: object


In [149]:
# We checked for duplicates with ID and had 0 duplicates
# We also checked for 'name' duplicates and found 469; but after analyzing the data realized they are all different people
print(file.duplicated('id').sum())
print(file.duplicated('name').sum())
file.sort_values("name", inplace = True)
dupli = file[file.duplicated('name',keep=False)==True]   # keep=false - default setting will show the duplicates after the original (so not the original) this makes it show
dupli.tail()

0
469


Unnamed: 0,id,name,age,nationality,club,bp,position,team & contract,height,weight,foot,growth,joined,loan date end,value,wage,release clause,contract,attacking,crossing,finishing,heading_accuracy,short_passing,volleys,skill,dribbling,curve,fk_accuracy,long_passing,ball_control,movement,acceleration,sprint_speed,agility,reactions,balance,power,shot_power,jumping,stamina,strength,long_shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing_tackle,sliding_tackle,goalkeeping,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
7173,243753,W. Coulibaly,28,Ivory Coast,Côte d'Ivoire,LB,LB,Ivory Coast Free,"5'10""",152lbs,Left,0,"Jan 1, 2018",,€0,€0,€0,Ivory Coast Free,253,67,42,50,62,32,263,67,39,41,50,66,351,72,72,72.0,63,72.0,290,35,70,74,69,42,280,64,58.0,58.0,58.0,42,52.0,206,65,71,70,45,12,8,8,6,11,1688,371,3 ★,2★,Medium,Medium,1 ★,72,41,56,67,66,69,25,55+2,55+2,55+2,61+0,58+0,58+0,58+0,61+0,59+2,59+2,59+2,63+2,60+2,60+2,60+2,63+2,68+1,64+2,64+2,64+2,68+1,67+2,66+2,66+2,66+2,67+2,15+2,69
6572,216437,Wallace,25,Brazil,Lazio,CB,CB,Lazio 2016 ~ 2021,"6'3""",187lbs,Right,4,"Jul 28, 2016",,€6.5M,€38K,€11.5M,2016 ~ 2021,225,34,21,75,59,36,209,45,33,29,50,52,286,63,66,43.0,71,43.0,268,40,71,51,80,26,257,79,44.333333,44.333333,44.333333,45,68.0,225,74,77,74,58,15,11,11,10,11,1528,333,2 ★,2★,Medium,Medium,2 ★,65,28,44,48,76,72,25,47+2,47+2,47+2,45+0,45+0,45+0,45+0,45+0,45+2,45+2,45+2,47+2,50+2,50+2,50+2,47+2,62+2,65+2,65+2,65+2,62+2,65+2,74+2,74+2,74+2,65+2,18+2,74
6801,188370,Wallace,31,Brazil,Göztepe SK,CB,CB,Göztepe SK 2018 ~ 2020,"6'1""",168lbs,Right,0,"Jan 31, 2018",,€3M,€19K,€6M,2018 ~ 2020,302,52,52,73,68,57,248,50,25,47,68,58,243,34,48,46.5,68,46.5,307,62,84,40,76,45,299,79,55.666667,55.666667,55.666667,53,66.0,214,70,74,70,47,10,9,7,13,8,1660,346,2 ★,2★,Medium,Medium,1 ★,42,52,59,52,73,68,3,56+2,56+2,56+2,52+0,53+0,53+0,53+0,52+0,54+2,54+2,54+2,53+2,59+2,59+2,59+2,53+2,61+2,68+2,68+2,68+2,61+2,63+2,72+1,72+1,72+1,63+2,16+2,73
797,187896,Wanderson,33,Brazil,Helsingborgs IF,CAM,ST CAM LM,Helsingborgs IF 2018 ~ 2019,"5'11""",165lbs,Right,0,"Aug 7, 2018",,€400K,€2K,€500K,2018 ~ 2019,321,62,65,64,67,63,331,67,67,65,64,68,302,57,55,63.0,64,63.0,310,73,50,60,62,65,280,66,48.666667,48.666667,48.666667,68,69.0,79,45,15,19,56,8,10,9,13,16,1679,346,3 ★,4★,Medium,Medium,2 ★,56,67,65,67,29,62,4,65+1,65+1,65+1,65+0,66+0,66+0,66+0,65+0,65+1,65+1,65+1,64+2,60+2,60+2,60+2,64+2,47+2,48+2,48+2,48+2,47+2,44+2,42+2,42+2,42+2,44+2,17+2,66
737,220171,Wanderson,32,Bulgaria,Bulgaria,CF,LM RM CAM,Bulgaria Free,"5'6""",134lbs,Right,0,"Jul 1, 2014",,€0,€0,€0,Free,301,56,68,42,67,68,329,69,63,66,61,70,375,78,76,77.0,67,77.0,298,71,42,67,51,67,267,41,56.0,56.0,56.0,58,55.0,93,19,40,34,53,11,7,12,10,13,1716,363,3 ★,3★,Medium,Medium,1 ★,77,68,63,71,31,53,3,67+2,67+2,67+2,69+0,70+-1,70+-1,70+-1,69+0,69+0,69+0,69+0,68+1,64+2,64+2,64+2,68+1,53+2,50+2,50+2,50+2,53+2,50+2,41+2,41+2,41+2,50+2,17+2,69


In [150]:
file = file.drop(['id', 'nationality', 'club', 'team & contract',
                 'height', 'weight', 'foot', 'joined', 'loan date end',
                 'value', 'wage', 'release clause', 'contract', 'attacking',
                 'skill', 'movement', 'power', 'mentality', 'defending',
                 'goalkeeping', 'total stats', 'base stats', 'w/f', 'ir' ],axis=1)

In [151]:
pd.options.display.max_rows = 5   # To display all the NaN values and deal with them
file.isnull().sum().sum()

0

In [152]:
file

Unnamed: 0,name,age,bp,position,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,sm,a/w,d/w,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
9722,A. Abdellaoui,27,CB,CB LB,1,57,25,58,54,32,47,39,35,46,55,66,64,59.0,57,59.0,45,86,59,67,24,62,51.333333,51.333333,51.333333,45,53.0,63,66,65,15,10,14,13,14,2★,Medium,Medium,65,31,51,52,63,65,8,48+2,48+2,48+2,50+0,49+0,49+0,49+0,50+0,49+2,49+2,49+2,52+2,51+2,51+2,51+2,52+2,59+2,58+2,58+2,58+2,59+2,61+2,63+1,63+1,63+1,61+2,18+2,63
1014,A. Abdennour,29,CB,CB,0,56,48,74,58,32,48,48,39,62,51,52,50,54.0,63,54.0,74,77,61,83,60,75,55.666667,55.666667,55.666667,51,65.0,63,69,71,12,8,7,8,15,2★,Low,Medium,51,55,56,51,68,76,10,57+2,57+2,57+2,52+0,53+0,53+0,53+0,52+0,53+2,53+2,53+2,53+2,57+2,57+2,57+2,53+2,61+2,64+2,64+2,64+2,61+2,63+2,69+1,69+1,69+1,63+2,16+2,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5754,Éverton Luiz,32,CDM,CDM CM,0,63,42,56,69,57,67,38,70,65,68,58,49,69.0,68,69.0,76,71,58,61,64,78,61.000000,61.000000,61.000000,57,67.0,70,66,71,10,8,10,14,11,2★,Medium,Medium,53,55,65,67,68,64,5,58+2,58+2,58+2,61+0,61+0,61+0,61+0,61+0,62+2,62+2,62+2,61+2,65+2,65+2,65+2,61+2,66+2,68+1,68+1,68+1,66+2,65+2,67+2,67+2,67+2,65+2,17+2,69
536,Đoàn Văn Hâu,20,LB,LB,10,51,35,64,62,27,55,30,34,55,57,75,76,65.5,59,65.5,64,68,76,70,56,69,43.666667,43.666667,43.666667,36,62.0,59,66,64,13,10,8,6,13,2★,High,High,76,45,50,58,63,71,4,53+2,53+2,53+2,54+0,52+0,52+0,52+0,54+0,53+2,53+2,53+2,55+2,55+2,55+2,55+2,55+2,63+2,61+2,61+2,61+2,63+2,64+2,64+2,64+2,64+2,64+2,16+2,65


In [153]:
pd.options.display.max_rows = 50
file['hits'] = file['hits'].str.replace('K','000')
file['hits'] = file['hits'].str.replace('.','')
file['hits'] = pd.to_numeric(file['hits'], errors='coerce')
file['hits'].unique

  file['hits'] = file['hits'].str.replace('.','')


<bound method Series.unique of 9722      8
1014     10
6805      5
11454    30
3633     12
         ..
9064     27
4780      4
6754      3
5754      5
536       4
Name: hits, Length: 11701, dtype: int64>

In [154]:
file.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,11701.0,25.27049,4.95764,16.0,21.0,25.0,29.0,43.0
growth,11701.0,5.534655,5.810903,-1.0,0.0,4.0,10.0,26.0
crossing,11701.0,51.593795,17.872747,6.0,41.0,56.0,65.0,94.0
finishing,11701.0,48.048116,19.399617,3.0,33.0,52.0,64.0,95.0
heading_accuracy,11701.0,53.49406,16.956195,5.0,46.0,56.0,65.0,93.0
short_passing,11701.0,60.4368,13.971811,8.0,56.0,63.0,69.0,94.0
volleys,11701.0,44.909409,17.901015,0.0,32.0,47.0,59.0,90.0
dribbling,11701.0,57.852491,18.050938,5.0,53.0,63.0,70.0,96.0
curve,11701.0,49.443979,18.383852,0.0,37.0,52.0,64.0,94.0
fk_accuracy,11701.0,44.35826,17.484142,5.0,32.0,43.0,58.0,94.0


In [155]:
data = file.copy()
data

Unnamed: 0,name,age,bp,position,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,sm,a/w,d/w,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
9722,A. Abdellaoui,27,CB,CB LB,1,57,25,58,54,32,47,39,35,46,55,66,64,59.0,57,59.0,45,86,59,67,24,62,51.333333,51.333333,51.333333,45,53.0,63,66,65,15,10,14,13,14,2★,Medium,Medium,65,31,51,52,63,65,8,48+2,48+2,48+2,50+0,49+0,49+0,49+0,50+0,49+2,49+2,49+2,52+2,51+2,51+2,51+2,52+2,59+2,58+2,58+2,58+2,59+2,61+2,63+1,63+1,63+1,61+2,18+2,63
1014,A. Abdennour,29,CB,CB,0,56,48,74,58,32,48,48,39,62,51,52,50,54.0,63,54.0,74,77,61,83,60,75,55.666667,55.666667,55.666667,51,65.0,63,69,71,12,8,7,8,15,2★,Low,Medium,51,55,56,51,68,76,10,57+2,57+2,57+2,52+0,53+0,53+0,53+0,52+0,53+2,53+2,53+2,53+2,57+2,57+2,57+2,53+2,61+2,64+2,64+2,64+2,61+2,63+2,69+1,69+1,69+1,63+2,16+2,70
6805,A. Abdi,26,LWB,LB,3,67,37,49,59,41,64,61,44,48,63,78,83,65.0,63,65.0,47,60,77,66,39,62,59.000000,59.000000,59.000000,41,63.0,57,61,61,13,13,13,10,9,2★,High,Medium,81,41,58,64,58,68,5,56+2,56+2,56+2,61+0,59+0,59+0,59+0,61+0,59+2,59+2,59+2,63+2,58+2,58+2,58+2,63+2,65+2,60+2,60+2,60+2,65+2,64+2,60+2,60+2,60+2,64+2,17+2,66
11454,A. Abedzadeh,27,GK,GK,2,12,10,13,24,12,10,13,14,24,15,30,25,44.5,70,44.5,53,55,21,60,11,26,23.666667,23.666667,23.666667,11,34.0,13,14,14,72,70,70,72,74,1★,Medium,Medium,72,70,70,74,27,72,30,25+2,25+2,25+2,22+0,24+0,24+0,24+0,22+0,26+2,26+2,26+2,23+2,26+2,26+2,26+2,23+2,21+2,25+2,25+2,25+2,21+2,21+2,24+2,24+2,24+2,21+2,72+2,73
3633,A. Abrashi,30,CDM,CDM CM,0,49,47,38,64,45,58,51,45,58,63,65,64,84.0,68,84.0,71,74,82,67,54,89,61.000000,61.000000,61.000000,54,58.0,72,75,76,16,16,13,13,12,2★,Medium,High,64,54,57,64,70,76,12,57+2,57+2,57+2,59+0,59+0,59+0,59+0,59+0,60+2,60+2,60+2,60+2,63+2,63+2,63+2,60+2,68+2,70+1,70+1,70+1,68+2,68+2,69+2,69+2,69+2,68+2,20+2,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9064,Éder,33,ST,ST,0,72,78,66,78,76,78,80,78,62,81,88,87,85.0,78,85.0,79,50,81,65,76,60,61.333333,61.333333,61.333333,78,78.0,31,20,26,8,12,11,10,9,4★,Medium,Low,87,78,75,80,29,67,27,78+1,78+1,78+1,80+-1,80+-1,80+-1,80+-1,80+-1,79+0,79+0,79+0,79+0,71+3,71+3,71+3,79+0,59+3,54+3,54+3,54+3,59+3,55+3,45+3,45+3,45+3,55+3,17+3,79
4780,Élton,34,LM,CAM RM,0,73,63,40,67,64,73,69,78,65,69,71,70,81.0,63,81.0,70,64,64,34,68,44,56.666667,56.666667,56.666667,72,65.0,34,29,31,13,15,12,10,8,4★,High,Medium,70,66,68,72,34,45,4,62+2,62+2,62+2,68+-1,67+0,67+0,67+0,68+-1,67+0,67+0,67+0,68+-1,63+2,63+2,63+2,68+-1,55+2,51+2,51+2,51+2,55+2,51+2,41+2,41+2,41+2,51+2,17+2,67
6754,Érico Sousa,24,RM,LM CAM,3,55,49,52,57,48,58,41,39,54,54,79,73,65.0,54,65.0,60,63,58,56,51,59,52.666667,52.666667,52.666667,42,53.0,48,43,44,6,10,11,11,13,2★,Medium,Medium,76,52,54,58,46,57,3,56+2,56+2,56+2,58+0,57+0,57+0,57+0,58+0,57+2,57+2,57+2,58+2,54+2,54+2,54+2,58+2,53+2,51+2,51+2,51+2,53+2,53+2,50+2,50+2,50+2,53+2,15+2,59
5754,Éverton Luiz,32,CDM,CDM CM,0,63,42,56,69,57,67,38,70,65,68,58,49,69.0,68,69.0,76,71,58,61,64,78,61.000000,61.000000,61.000000,57,67.0,70,66,71,10,8,10,14,11,2★,Medium,Medium,53,55,65,67,68,64,5,58+2,58+2,58+2,61+0,61+0,61+0,61+0,61+0,62+2,62+2,62+2,61+2,65+2,65+2,65+2,61+2,66+2,68+1,68+1,68+1,66+2,65+2,67+2,67+2,67+2,65+2,17+2,69


In [156]:
X = data.drop(['ova'],axis=1)   # OVA is our target
y = data['ova']

X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)

In [157]:
from sklearn.preprocessing import MinMaxScaler 

transformer = MinMaxScaler().fit(X_num)
x_normalized = transformer.transform(X_num)
print(x_normalized.shape)
data_normalized = pd.DataFrame(x_normalized, columns = X_num.columns)
pd.DataFrame(x_normalized, columns=X_num.columns)

print(type(data_normalized))

(11701, 43)
<class 'pandas.core.frame.DataFrame'>


In [158]:
X_cat1=X_cat.drop(columns=['position', 'ls', 'st', 'rs', 'lw', 'lf',
                           'cf', 'rf', 'rw' ,'lam', 'cam','ram', 'lm',
                            'lcm', 'cm','rcm', 'rm', 'lwb', 'ldm' , 'cdm',
                            'rdm' , 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
                            'gk'],axis=1)

In [159]:
X_num.corr()

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits
age,1.0,-0.854136,0.15928,0.110223,0.178446,0.185496,0.17934,0.04414,0.173597,0.231401,0.230267,0.129641,-0.183392,-0.175948,-0.077099,0.504708,-0.077099,0.307396,0.168928,0.069533,0.350674,0.198765,0.27928,0.242512,0.242512,0.242512,0.201801,0.413964,0.158495,0.10987,0.079784,0.116316,0.119441,0.125299,0.129068,0.116089,-0.176232,0.311175,0.402361,0.242675,0.225285,0.434448,-0.044662
growth,-0.854136,1.0,-0.252015,-0.189302,-0.228983,-0.271381,-0.251627,-0.153463,-0.253594,-0.275789,-0.294287,-0.224358,0.026837,0.013074,-0.039872,-0.562717,-0.039872,-0.368967,-0.22105,-0.22623,-0.371032,-0.27713,-0.317922,-0.324221,-0.324221,-0.324221,-0.231044,-0.465516,-0.199809,-0.145174,-0.120153,-0.04952,-0.04773,-0.047191,-0.055057,-0.045935,0.019677,-0.334891,-0.439781,-0.320569,-0.240244,-0.484123,0.040944
crossing,0.15928,-0.252015,1.0,0.645621,0.43557,0.800162,0.664806,0.854544,0.818334,0.751382,0.740585,0.834774,0.633555,0.601544,0.660918,0.372938,0.660918,0.533293,0.08146,0.634695,-0.055725,0.733978,0.457167,0.820503,0.820503,0.820503,0.629131,0.590896,0.417238,0.404626,0.378677,-0.647155,-0.644804,-0.628339,-0.645034,-0.649351,0.303331,0.205157,0.50426,0.445988,0.22619,-0.000844,0.036906
finishing,0.110223,-0.189302,0.645621,1.0,0.455388,0.650934,0.875771,0.820629,0.747704,0.695429,0.485792,0.783835,0.567471,0.550783,0.572061,0.330846,0.572061,0.726205,0.054275,0.472761,0.003737,0.883367,0.240675,0.633437,0.633437,0.633437,0.839377,0.560906,-0.033656,-0.073428,-0.116006,-0.578322,-0.576235,-0.560607,-0.574026,-0.57548,0.279905,0.608192,0.321891,0.468894,-0.265831,-0.061217,0.05013
heading_accuracy,0.178446,-0.228983,0.43557,0.455388,1.0,0.630159,0.478915,0.531864,0.403207,0.366814,0.478891,0.639741,0.255302,0.31627,0.15151,0.331619,0.15151,0.372332,0.36923,0.570417,0.504878,0.480324,0.68574,0.594774,0.594774,0.594774,0.537305,0.516048,0.535808,0.513173,0.476353,-0.713628,-0.711295,-0.693076,-0.705803,-0.711708,-0.170158,-0.115646,-0.092903,-0.146766,0.376336,0.401288,0.019749
short_passing,0.185496,-0.271381,0.800162,0.650934,0.630159,1.0,0.673312,0.839028,0.752634,0.719229,0.886005,0.913722,0.508665,0.496319,0.553398,0.490238,0.553398,0.585329,0.162841,0.686334,0.141147,0.752279,0.614463,0.883164,0.883164,0.883164,0.665128,0.716718,0.550348,0.531136,0.488904,-0.717932,-0.715735,-0.697825,-0.712055,-0.719149,0.095696,0.153317,0.466105,0.348521,0.360711,0.167888,0.049795
volleys,0.17934,-0.251627,0.664806,0.875771,0.478915,0.673312,1.0,0.789894,0.810603,0.729989,0.534099,0.773288,0.509738,0.489444,0.591051,0.379871,0.591051,0.739453,0.135098,0.464982,0.037756,0.857321,0.311398,0.667062,0.667062,0.667062,0.820655,0.610954,0.054833,0.016005,-0.004173,-0.564526,-0.570284,-0.577161,-0.567359,-0.569596,0.21148,0.538622,0.366331,0.44755,-0.159121,-0.00838,0.045972
dribbling,0.04414,-0.153463,0.854544,0.820629,0.531864,0.839028,0.789894,1.0,0.829886,0.744447,0.704079,0.938222,0.713093,0.686167,0.713499,0.353965,0.713499,0.617583,0.09419,0.657167,-0.040269,0.841421,0.43674,0.819521,0.819521,0.819521,0.758656,0.631071,0.299085,0.2764,0.243403,-0.753101,-0.750538,-0.732805,-0.748854,-0.752566,0.330614,0.2831,0.361337,0.493366,0.050767,-0.047277,0.055826
curve,0.173597,-0.253594,0.818334,0.747704,0.403207,0.752634,0.810603,0.829886,1.0,0.835364,0.682583,0.810013,0.556903,0.51664,0.674736,0.39326,0.674736,0.671723,0.115029,0.539393,-0.049607,0.820317,0.381058,0.774804,0.774804,0.774804,0.726952,0.633895,0.249561,0.226667,0.215055,-0.581268,-0.587455,-0.596169,-0.586761,-0.590282,0.24695,0.398113,0.511517,0.494821,0.054465,-0.024401,0.047926
fk_accuracy,0.231401,-0.275789,0.751382,0.695429,0.366814,0.719229,0.729989,0.744447,0.835364,1.0,0.689447,0.746248,0.446731,0.405067,0.538774,0.376534,0.538774,0.662166,0.018933,0.481112,-0.032598,0.805765,0.371314,0.721136,0.721136,0.721136,0.730167,0.598287,0.251366,0.236422,0.195164,-0.528536,-0.527341,-0.508734,-0.526201,-0.529288,0.142495,0.40251,0.527765,0.440015,0.083423,-0.011374,0.036962


In [160]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [161]:
# OneHotEncoder
X_cat_encoded = X_cat1['bp']
X_cat_encoded = pd.DataFrame(X_cat_encoded, columns=['bp'])

encoder = OneHotEncoder(drop='first').fit(X_cat_encoded)
encoded = encoder.transform(X_cat_encoded).toarray()
encoded
cols = encoder.get_feature_names(input_features=X_cat_encoded.columns)
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.head()



Unnamed: 0,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
# Label Encoder for the values: A/W, D/W, SM

# we want to know what are the values given
print(X_cat1['a/w'].unique())
print(X_cat1['d/w'].unique())
print(X_cat1['sm'].unique())

['Medium' 'Low' 'High']
['Medium' 'High' 'Low']
['2★' '1★' '3★' '4★' '5★']


In [163]:
# Create a list with the values for the label encoder

# just for a/w
X_cat_aw = X_cat1['a/w']
X_cat_aw = pd.DataFrame(X_cat_aw, columns=['a/w'])

# just for d/w
X_cat_dw = X_cat1['d/w']
X_cat_dw = pd.DataFrame(X_cat_dw, columns=['d/w'])

# # just for sm
X_cat_sm = X_cat1['sm']
X_cat_sm = pd.DataFrame(X_cat_sm, columns=['sm'])

# just for a/w
label_encoded_aw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_aw) # give a value to each type
label_encoded_aw = pd.DataFrame(label_encoded_aw,columns=X_cat_aw.columns)
display(label_encoded_aw.head())

# just for d/w
label_encoded_dw = LabelEncoder().fit(['Low', 'Medium', 'High']).transform(X_cat_dw) # give a value to each type
label_encoded_dw = pd.DataFrame(label_encoded_dw,columns=X_cat_dw.columns)
display(label_encoded_dw.head())

# just for sm
label_encoded_sm = LabelEncoder().fit(['1★','2★', '3★', '4★','5★']).transform(X_cat_sm) # give a value to each type
label_encoded_sm = pd.DataFrame(label_encoded_sm,columns=X_cat_sm.columns)
display(label_encoded_sm.head())

  y = column_or_1d(y, warn=True)


Unnamed: 0,a/w
0,2
1,1
2,0
3,2
4,2


  y = column_or_1d(y, warn=True)


Unnamed: 0,d/w
0,2
1,2
2,2
3,2
4,0


  y = column_or_1d(y, warn=True)


Unnamed: 0,sm
0,1
1,1
2,1
3,0
4,1


In [164]:
# Concat DataFrames

X = pd.concat([data_normalized,label_encoded_aw, label_encoded_dw, label_encoded_sm, onehot_encoded], axis=1)
X

Unnamed: 0,age,growth,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,fk_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,pac,sho,pas,dri,def,phy,hits,a/w,d/w,sm,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST
0,0.407407,0.074074,0.579545,0.239130,0.602273,0.534884,0.355556,0.461538,0.414894,0.337079,0.440476,0.549451,0.630952,0.623529,0.617801,0.458333,0.617801,0.397590,0.905263,0.552941,0.610390,0.222222,0.609195,0.611111,0.611111,0.611111,0.430233,0.488095,0.674157,0.714286,0.722222,0.157303,0.093023,0.139535,0.120879,0.146067,0.557143,0.184211,0.382353,0.358209,0.662338,0.575758,0.000054,2,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.481481,0.037037,0.568182,0.489130,0.784091,0.581395,0.355556,0.472527,0.510638,0.382022,0.630952,0.505495,0.464286,0.458824,0.565445,0.541667,0.565445,0.746988,0.810526,0.576471,0.818182,0.622222,0.758621,0.662698,0.662698,0.662698,0.500000,0.630952,0.674157,0.750000,0.788889,0.123596,0.069767,0.058140,0.065934,0.157303,0.357143,0.500000,0.455882,0.343284,0.727273,0.742424,0.000076,1,2,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.370370,0.148148,0.693182,0.369565,0.500000,0.593023,0.455556,0.648352,0.648936,0.438202,0.464286,0.637363,0.773810,0.847059,0.680628,0.541667,0.680628,0.421687,0.631579,0.764706,0.597403,0.388889,0.609195,0.702381,0.702381,0.702381,0.383721,0.607143,0.606742,0.654762,0.677778,0.134831,0.127907,0.127907,0.087912,0.089888,0.785714,0.315789,0.485294,0.537313,0.597403,0.621212,0.000022,0,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.407407,0.111111,0.068182,0.076087,0.090909,0.186047,0.133333,0.054945,0.138298,0.101124,0.178571,0.109890,0.202381,0.164706,0.465969,0.638889,0.465969,0.493976,0.578947,0.105882,0.519481,0.077778,0.195402,0.281746,0.281746,0.281746,0.034884,0.261905,0.112360,0.095238,0.155556,0.797753,0.790698,0.790698,0.769231,0.820225,0.657143,0.697368,0.661765,0.686567,0.194805,0.681818,0.000293,2,2,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.518519,0.037037,0.488636,0.478261,0.375000,0.651163,0.500000,0.582418,0.542553,0.449438,0.583333,0.637363,0.619048,0.623529,0.879581,0.611111,0.879581,0.710843,0.778947,0.823529,0.610390,0.555556,0.919540,0.726190,0.726190,0.726190,0.534884,0.547619,0.775281,0.821429,0.844444,0.168539,0.162791,0.127907,0.120879,0.123596,0.542857,0.486842,0.470588,0.537313,0.753247,0.742424,0.000098,2,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11696,0.629630,0.037037,0.750000,0.815217,0.693182,0.813953,0.844444,0.802198,0.851064,0.820225,0.630952,0.835165,0.892857,0.894118,0.890052,0.750000,0.890052,0.807229,0.526316,0.811765,0.584416,0.800000,0.586207,0.730159,0.730159,0.730159,0.813953,0.785714,0.314607,0.166667,0.288889,0.078652,0.116279,0.104651,0.087912,0.089888,0.871429,0.802632,0.735294,0.776119,0.220779,0.606061,0.000261,2,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11697,0.666667,0.037037,0.761364,0.652174,0.397727,0.686047,0.711111,0.747253,0.734043,0.820225,0.666667,0.703297,0.690476,0.694118,0.848168,0.541667,0.848168,0.698795,0.673684,0.611765,0.181818,0.711111,0.402299,0.674603,0.674603,0.674603,0.744186,0.630952,0.348315,0.273810,0.344444,0.134831,0.151163,0.116279,0.087912,0.078652,0.628571,0.644737,0.632353,0.656716,0.285714,0.272727,0.000011,0,2,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11698,0.296296,0.148148,0.556818,0.500000,0.534091,0.569767,0.533333,0.582418,0.436170,0.382022,0.535714,0.538462,0.785714,0.729412,0.680628,0.416667,0.680628,0.578313,0.663158,0.541176,0.467532,0.522222,0.574713,0.626984,0.626984,0.626984,0.395349,0.488095,0.505618,0.440476,0.488889,0.056180,0.093023,0.104651,0.098901,0.134831,0.714286,0.460526,0.426471,0.447761,0.441558,0.454545,0.000000,2,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
11699,0.592593,0.037037,0.647727,0.423913,0.579545,0.709302,0.633333,0.681319,0.404255,0.730337,0.666667,0.692308,0.535714,0.447059,0.722513,0.611111,0.722513,0.771084,0.747368,0.541176,0.532468,0.666667,0.793103,0.726190,0.726190,0.726190,0.569767,0.654762,0.752809,0.714286,0.788889,0.101124,0.069767,0.093023,0.131868,0.112360,0.385714,0.500000,0.588235,0.582090,0.727273,0.560606,0.000022,2,2,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
# Apply linear regression & Model Validation
# We defined a function to do all the next steps together:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math

In [166]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('X_train.shape: ', X_train.shape)
print('X_test.shape: ',X_test.shape)
print('y_train.shape: ',y_train.shape)
print('y_test.shape: ',y_test.shape)
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)
predictions = lm.predict(X_train)
print('r2_score,y_train, predictions: ',r2_score(y_train, predictions))
predictions_test = lm.predict(X_test)
print('y train prediction: ',r2_score(y_train, predictions))
print('y test prediction: ',r2_score(y_test, predictions_test))
mse=mean_squared_error(y_test,predictions_test)
print('mse: ',mse)
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
print('rmse: ',rmse)
mae = mean_absolute_error(y_test, predictions_test)
print('mae: ',mae)
print('y test mean: ',y_test.mean())

X_train.shape:  (9360, 60)
X_test.shape:  (2341, 60)
y_train.shape:  (9360,)
y_test.shape:  (2341,)
r2_score,y_train, predictions:  0.909563942149592
y train prediction:  0.909563942149592
y test prediction:  0.9080161227485429
mse:  4.3362417899614565
rmse:  2.0823644709707896
mae:  1.6107426433965142
y test mean:  66.76420333190944
