In [522]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
sns.set()

In [523]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [524]:
def preprocess(data):

    # Fix the weights (as kg)
    Weight = [] 
    for i in data['Weight']:
        Weight.append(str(i).replace('lbs' , ''))

    data['Weight'] = Weight
    data['Weight'] = data['Weight'].astype(np.number)
    data['Weight'] = data['Weight']/2.2046

    
    # Fix the heights (as cm)
    

    cmheight = []
    for item in data['Height']:
        item = str(item).replace('\"', '')
        feetinches = item.split('\'')
        cmheight.append(round(int(feetinches[0]) * 30.48) + int(feetinches[1]) * 2.54)
    data['Height'] = cmheight
        
    # Remove the stars from columns 'IR', 'W/F', and 'SM'
    newvalues = []
    for item in data['IR']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['IR']=newvalues

    newvalues = []
    for item in data['W/F']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['W/F']=newvalues
    
    newvalues = []
    for item in data['SM']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['SM']=newvalues
    

    # Transform the 'Contract' column to only give the end date of the contract 
#     contractend = []
#     for item in data['Contract']:
#         if 'Free' in item:
#             contractend.append('DELETE')
#         else:
#             item = item.replace(' On Loan', '')
#             item = item[-4:]
#             contractend.append(int(item))
        
#     data['Contract']=contractend

    # Transform the 'loan date end' column to an on loan (yes/no) column
    data['Loan Date End'] = data['Loan Date End'].fillna(0)

    loanyesno = []
    for item in data['Loan Date End']:
        if item == 0:
            loanyesno.append(item)
        else: 
            loanyesno.append(1)

    data['Loan Date End'] = loanyesno
    data = data.rename(columns={'Loan Date End':'On Loan'})

    data['On Loan']
    
    # Remove the plusses and minuses on the last few rows   
    toclean = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']

    superlist = []

    for column in toclean: 
        values = []
        for item in data[column]:
            if item[1] == '-' or item[1] == '+':
                values.append(int(item[0]))
            else: 
                values.append(int(item[:2]))
        superlist.append(values)


    for i in range(len(toclean)):
        data[toclean[i]] = superlist[i]

    # Clean up currency cells by removing € sign and converting 'M' and 'K' to numbers
    
    newvalues1 = []
    for item in data['Value']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item = item * 1000
            newvalues1.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item = item * 1000000      
            newvalues1.append(int(item))
        else: 
            item = int(item)
            newvalues1.append(int(item))
    data['Value']=newvalues1
    
    
    newvalues2 = []
    for item in data['Wage']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues2.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues2.append(int(item))
        else: 
            item = int(item)
            newvalues2.append(int(item))
    data['Wage']=newvalues2
    
    
    newvalues3 = []
    for item in data['Release Clause']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues3.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues3.append(int(item))
        else: 
            item = int(item)
            newvalues3.append(int(item))
    data['Release Clause']=newvalues3
    
    
    # Clean up Hits column (which contains some values with K in them)¶
    newhits = []
    for item in data['Hits']:
        if 'K' in item:
            item = item.replace('K','')
            item = float(item) * 1000
            newhits.append(int(item))
        else: 
            newhits.append(int(item))
    data['Hits'] = newhits

    # Dropping stuff
    
    # Drop columns with irrelevant attributes
    #Some atributes are used to calculate the rating per category, that we already have, so individual attributes are not necessary here

    # Different versions for commenting out and experimenting
    
#     # First, a version with only the summed up columns (Attacking etc)
#     data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
#            'Dribbling','Curve','FK Accuracy','Long Passing','Ball Control',
#            'Acceleration','Sprint Speed','Agility','Reactions','Balance',
#           'Shot Power','Jumping','Stamina','Strength','Long Shots',
#           'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
#           'Marking','Standing Tackle','Sliding Tackle',
#           'GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes'], axis=1)
    
    # Then, a version with only the component columns
    data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
'Attacking', 'Skill', 'Movement', 'Power', 'Mentality', 'Defending', 'Defending'], axis=1)
    
    
    
    #data = data.drop(['Age', 'Height', 'Weight', 'On Loan', 'Value', 'Wage', 'Release Clause'], axis=1)
    #data = data.drop(['Value', 'Wage', 'Release Clause'], axis=1)
    #data = data.drop(['Age', 'Height', 'Weight'], axis=1)

    for column in data.columns: 
        data = data[data[column].isna()==False]

    #Drop the rows with less than 5% of NaN
#     data = data[data['A/W'].isna()==False]
#     data = data[data['D/W'].isna()==False]
    
    return data



In [526]:
df = preprocess(pd.read_csv('fifa21_train.csv'))
df2 = preprocess(pd.read_csv('fifa21_validate.csv'))


### And a function to do the X-Y split

In [527]:
y = df['OVA']
X = df.drop(['OVA'], axis = 1)

#Split numerical and categorical data
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)


transformer = MinMaxScaler().fit(X_num)
# encoder = OneHotEncoder().fit(X_cat)


def xysplit(dataframe): 
    
    #Initial split (y will remain unchanged, X will undergo more operations)
    y = dataframe['OVA']
    X = dataframe.drop(['OVA'], axis = 1)
    
    #Split numerical and categorical data
    X_num = X.select_dtypes(np.number)
    X_cat = X.select_dtypes(object)

    #Normalize the numerical data
    x_normalized = transformer.transform(X_num)
    X_normal = pd.DataFrame(x_normalized, columns=X_num.columns)

    #Encode categorical data
    encoder = OneHotEncoder().fit(X_cat)

    encoded = encoder.transform(X_cat).toarray()

    # And get relevant headers for the encoded categorical data
    headers = []

    for category in encoder.categories_:
        for unit in category: 
            headers.append(unit)

    categ_encoded=pd.DataFrame(encoded, columns=headers)
    
    #Finally, concatenate the (normalized) numerical and (encoded) categorical data
    X = pd.concat([X_normal, categ_encoded], axis = 1)
    
    return X, y

In [528]:
X, y = xysplit(df)

### Test train split

In [529]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

### Build the model 

In [530]:
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

### A function to generate scores to evaluate

In [531]:
def scores(y_data, predictions):
    print("r2 score:",r2_score(y_data, predictions))
    print("MSE score:",mean_squared_error(y_data, predictions))
    print("RMSE score:",np.sqrt(mean_squared_error(y_data, predictions)))
    print("MAE score:", mean_absolute_error(y_data, predictions))

### Get the R2 score for the training and test data

In [532]:
predictions = lm.predict(X_train)
predictions_test = lm.predict(X_test)

print("Training data")
scores(y_train, predictions)

print("Test data")
scores(y_test, predictions_test)

Training data
r2 score: 0.9204087603462355
MSE score: 3.702127630892997
RMSE score: 1.9240913779997553
MAE score: 1.4901430460456113
Test data
r2 score: 0.9229505531657108
MSE score: 3.7098033095345193
RMSE score: 1.9260849694482638
MAE score: 1.5127089886487965


### Validate new data

In [None]:

def preprocess(df):
    cols = []
    for column in df.columns:
        cols.append(column.lower())
    df.columns = cols
    return df


df = preprocess(pd.read_csv('fifa21_train.csv'))
df.head(3)

Unnamed: 0,id,name,age,nationality,club,bp,position,team & contract,height,weight,foot,growth,joined,loan date end,value,wage,release clause,contract,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
0,184383,A. Pasche,26,Switzerland,FC Lausanne-Sport,CM,CM CDM,FC Lausanne-Sport 2015 ~ 2020,"5'9""",161lbs,Right,1,"Jul 1, 2015",,€525K,€4K,€801K,2015 ~ 2020,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,7,12,14,9,6,1682,357,4 ★,2★,High,Medium,1 ★,69,51,63,63,51,60,3,58+1,58+1,58+1,61+0,62+0,62+0,62+0,61+0,63+1,63+1,63+1,63+1,63+1,63+1,63+1,63+1,59+1,59+1,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,188044,Alan Carvalho,30,China PR,Beijing Sinobo Guoan FC,ST,ST LW LM,"Beijing Sinobo Guoan FC Dec 31, 2020 On Loan","6'0""",159lbs,Right,0,"Jan 16, 2015","Dec 31, 2020",€8.5M,€23K,€0,"Dec 31, 2020 On Loan",365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,11,7,14,7,16,1961,412,3 ★,4★,High,Low,2 ★,83,75,68,82,33,71,44,77+0,77+0,77+0,77+0,77+0,77+0,77+0,77+0,76+1,76+1,76+1,76+1,68+2,68+2,68+2,76+1,57+2,53+2,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77
2,184431,S. Giovinco,33,Italy,Al Hilal,CAM,CAM CF,Al Hilal 2019 ~ 2022,"5'4""",134lbs,Right,0,"Jan 31, 2019",,€9M,€49K,€15.3M,2019 ~ 2022,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,6,3,6,3,3,1925,404,4 ★,4★,High,Medium,2 ★,80,77,78,86,27,56,73,73+2,73+2,73+2,80+0,79+0,79+0,79+0,80+0,80+0,80+0,80+0,79+1,74+2,74+2,74+2,79+1,59+2,56+2,56+2,56+2,59+2,53+2,41+2,41+2,41+2,53+2,12+2,80


In [None]:
df2 = preprocess(pd.read_csv('fifa21_validate.csv'))
df2.head()

Unnamed: 0,id,name,age,nationality,club,bp,position,team & contract,height,weight,foot,growth,joined,loan date end,value,wage,release clause,contract,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
0,219461,E. Palmer-Brown,23,United States,FK Austria Wien,CB,CB,"FK Austria Wien Jun 30, 2021 On Loan","6'2""",194lbs,Right,7,"Feb 8, 2018","Jun 30, 2021",€975K,€5K,€0,"Jun 30, 2021 On Loan",230,47,21,62,60,40.0,228,44,43.0,36,51,54,303,60,68,63.0,63,49.0,288,48,77.0,51,87,25,246,68,62.0,38.0,39.0,39,49.0,200,68,67,65.0,56,11,8,15,13,9,1551,334,2 ★,2★,Low,High,1 ★,64,30,50,50,66,74,34,48+2,48+2,48+2,48+0,47+0,47+0,47+0,48+0,48+2,48+2,48+2,50+2,51+2,51+2,51+2,50+2,59+2,61+2,61+2,61+2,59+2,61+2,67+2,67+2,67+2,61+2,16+2,67
1,221896,D. Avdijaj,22,Kosovo,Heart of Midlothian,CAM,LM CAM,Heart of Midlothian 2020 ~ 2020,"5'8""",154lbs,Right,5,"Jan 20, 2020",,€1.2M,€3K,€2.2M,2020 ~ 2020,298,62,60,44,62,70.0,330,76,68.0,56,60,70,375,77,72,83.0,64,79.0,323,76,62.0,63,47,75,286,72,26.0,64.0,64.0,60,65.0,61,19,23,19.0,53,14,13,9,9,8,1726,358,4 ★,3★,High,Low,1 ★,74,67,62,74,24,57,12,64+2,64+2,64+2,68+0,68+0,68+0,68+0,68+0,68+2,68+2,68+2,67+2,61+2,61+2,61+2,67+2,49+2,47+2,47+2,47+2,49+2,45+2,38+2,38+2,38+2,45+2,17+2,68
2,247428,D. Ochoa,19,United States,Real Salt Lake,GK,GK,Real Salt Lake 2018 ~ 2020,"6'2""",176lbs,Right,17,"Nov 28, 2018",,€120K,€500,€249K,2018 ~ 2020,48,7,5,11,21,4.0,52,6,8.0,8,20,10,165,28,25,33.0,41,38.0,171,40,49.0,22,54,6,76,20,9.0,7.0,26.0,14,31.0,27,8,9,10.0,269,56,52,53,53,55,808,295,2 ★,1★,Medium,Medium,1 ★,56,52,53,55,26,53,3,18+2,18+2,18+2,15+0,17+0,17+0,17+0,15+0,17+2,17+2,17+2,16+2,18+2,18+2,18+2,16+2,16+2,18+2,18+2,18+2,16+2,16+2,18+2,18+2,18+2,16+2,53+2,54
3,255120,N. Kenneh,16,England,Leeds United,CDM,CB CDM RB,Leeds United 2020 ~ 2022,"6'3""",170lbs,Right,23,"Jan 10, 2020",,€160K,€500,€464K,2020 ~ 2022,215,38,31,55,59,32.0,224,51,34.0,38,47,54,275,59,58,56.0,48,54.0,242,48,48.0,60,58,28,230,61,55.0,33.0,40.0,41,59.0,159,53,52,54.0,36,7,5,13,5,6,1381,303,3 ★,2★,Medium,Medium,1 ★,58,34,47,52,53,59,6,46+2,46+2,46+2,47+0,46+0,46+0,46+0,47+0,47+2,47+2,47+2,49+2,49+2,49+2,49+2,49+2,53+2,54+2,54+2,54+2,53+2,53+2,54+2,54+2,54+2,53+2,11+2,55
4,215556,E. Fernandes,24,Switzerland,1. FSV Mainz 05,CDM,CM CDM,1. FSV Mainz 05 2019 ~ 2023,"6'2""",170lbs,Right,5,"Jul 1, 2019",,€2.3M,€13K,€4.3M,2019 ~ 2023,295,57,59,45,78,56.0,327,71,57.0,51,74,74,320,68,66,66.0,64,56.0,337,73,56.0,74,72,62,314,66,78.0,53.0,62.0,55,63.0,211,72,68,71.0,60,12,7,13,15,13,1864,407,4 ★,2★,Medium,Medium,1 ★,67,62,68,70,69,71,45,63+2,63+2,63+2,66+0,66+0,66+0,66+0,66+0,68+2,68+2,68+2,67+2,70+2,70+2,70+2,67+2,70+2,72+2,72+2,72+2,70+2,69+2,68+2,68+2,68+2,69+2,18+2,70


In [None]:
df.shape, df2.shape

((11701, 101), (1999, 101))

In [None]:
df = df.drop(['age', 'nationality', 'position', 'club', 'team & contract', 'height', 'weight', 'foot', 'growth', 'joined', 'loan date end', 'value', 'wage', 'release clause', 'contract'], axis=1)
df2 = df2.drop(['age', 'nationality', 'position', 'club', 'team & contract', 'height', 'weight', 'foot', 'growth', 'joined', 'loan date end', 'value', 'wage', 'release clause', 'contract'], axis=1)

In [None]:
# df = df.reset_index()

In [None]:
identity = df.loc[:,['id','name']]
df = df.drop(['id', 'name'], axis=1)

identity2 = df2.loc[:,['id','name']]
df2 = df2.drop(['id', 'name'], axis=1)

In [None]:
identity.head(3)

Unnamed: 0,id,name
0,184383,A. Pasche
1,188044,Alan Carvalho
2,184431,S. Giovinco


In [None]:
identity2.head(3)

Unnamed: 0,id,name
0,219461,E. Palmer-Brown
1,221896,D. Avdijaj
2,247428,D. Ochoa


In [None]:
newlist = []
for x in df['ls']:
        if '+' in x:
            x = int(x.split('+')[0]) + int(x.split('+')[1])
            newlist.append(x)
        elif '-' in x:
            x = int(x.split('-')[0]) - int(x.split('-')[1])
            newlist.append(x)
        else:
            x = x

df['ls'] = pd.DataFrame (newlist, columns = ['ls'])

In [None]:
df.head()

Unnamed: 0,bp,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,a/w,d/w,ir,pac,sho,pas,dri,def,phy,hits,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,ova
0,CM,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,7,12,14,9,6,1682,357,4 ★,2★,High,Medium,1 ★,69,51,63,63,51,60,3,59,58+1,58+1,61+0,62+0,62+0,62+0,61+0,63+1,63+1,63+1,63+1,63+1,63+1,63+1,63+1,59+1,59+1,59+1,59+1,59+1,58+1,54+1,54+1,54+1,58+1,15+1,64
1,ST,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,11,7,14,7,16,1961,412,3 ★,4★,High,Low,2 ★,83,75,68,82,33,71,44,77,77+0,77+0,77+0,77+0,77+0,77+0,77+0,76+1,76+1,76+1,76+1,68+2,68+2,68+2,76+1,57+2,53+2,53+2,53+2,57+2,53+2,48+2,48+2,48+2,53+2,18+2,77
2,CAM,336,73,76,34,78,75.0,424,85,89.0,91,74,85,424,84,76,93.0,78,93.0,308,79,34.0,75,42,78,332,75,26.0,80.0,78.0,73,82.0,80,23,29,28.0,21,6,3,6,3,3,1925,404,4 ★,4★,High,Medium,2 ★,80,77,78,86,27,56,73,75,73+2,73+2,80+0,79+0,79+0,79+0,80+0,80+0,80+0,80+0,79+1,74+2,74+2,74+2,79+1,59+2,56+2,56+2,56+2,59+2,53+2,41+2,41+2,41+2,53+2,12+2,80
3,CDM,242,44,42,58,62,36.0,259,54,41.0,46,57,61,282,54,59,59.0,55,55.0,277,57,60.0,64,58,38,257,61,57.0,31.0,54.0,54,48.0,168,55,58,55.0,42,8,9,6,7,12,1527,329,2 ★,2★,Medium,Medium,1 ★,57,44,54,57,57,60,7,52,50+2,50+2,51+0,51+0,51+0,51+0,51+0,53+2,53+2,53+2,53+2,56+2,56+2,56+2,53+2,56+2,58+2,58+2,58+2,56+2,57+2,58+2,58+2,58+2,57+2,14+2,59
4,CDM,249,49,37,61,68,34.0,280,64,44.0,45,61,66,324,66,66,61.0,62,69.0,280,61,34.0,81,61,43,294,66,60.0,55.0,64.0,49,58.0,185,58,61,66.0,52,8,9,15,5,15,1664,360,2 ★,3★,Low,Medium,1 ★,66,44,60,64,60,66,4,58,56+2,56+2,59+0,59+0,59+0,59+0,59+0,61+2,61+2,61+2,62+2,63+2,63+2,63+2,62+2,64+2,64+2,64+2,64+2,64+2,63+2,61+2,61+2,61+2,63+2,15+2,65


In [None]:
# df.isna().sum()

In [None]:
# df = df[df['position'].isna()==False]
df = df[df['composure'].isna()==False]
df2 = df2[df2['composure'].isna()==False]

df.shape

(11422, 84)

In [None]:
# df.isna().sum()

In [None]:
# round(df.isna().sum()/len(df),4)*100  # shows the percentage of null values in a column
# nulls_df = pd.DataFrame(round(df.isna().sum()/len(df),4)*100)
# nulls_df
# nulls_df = nulls_df.reset_index()
# nulls_df
# nulls_df.columns = ['header_name', 'percent_nulls']
# nulls_df

In [None]:
# df.columns

In [None]:
# df.dtypes

In [None]:
df['w/f'] = df['w/f'].str.replace('★','').astype(np.int64)
df['sm'] = df['sm'].str.replace('★','').astype(np.int64)
df['ir'] = df['ir'].str.replace('★','').astype(np.int64)

df2['w/f'] = df2['w/f'].str.replace('★','').astype(np.int64)
df2['sm'] = df2['sm'].str.replace('★','').astype(np.int64)
df2['ir'] = df2['ir'].str.replace('★','').astype(np.int64)



In [None]:
df['hits'] = df['hits'].str.replace('.1K','100')
df['hits'] = df['hits'].str.replace('.2K','200')
df['hits'] = df['hits'].str.replace('.3K','300')
df['hits'] = df['hits'].str.replace('.4K','300')
df['hits'] = df['hits'].str.replace('.5K','500')
df['hits'] = df['hits'].str.replace('.6K','600')
df['hits'] = df['hits'].str.replace('.7K','700')
df['hits'] = df['hits'].str.replace('.8K','800')
df['hits'] = df['hits'].str.replace('.9K','900')
df['hits'] = df['hits'].str.replace('K','000')
df['hits'] = pd.to_numeric(df['hits'])

df2['hits'] = df2['hits'].str.replace('.1K','100')
df2['hits'] = df2['hits'].str.replace('.2K','200')
df2['hits'] = df2['hits'].str.replace('.3K','300')
df2['hits'] = df2['hits'].str.replace('.4K','300')
df2['hits'] = df2['hits'].str.replace('.5K','500')
df2['hits'] = df2['hits'].str.replace('.6K','600')
df2['hits'] = df2['hits'].str.replace('.7K','700')
df2['hits'] = df2['hits'].str.replace('.8K','800')
df2['hits'] = df2['hits'].str.replace('.9K','900')
df2['hits'] = df2['hits'].str.replace('K','000')
df2['hits'] = pd.to_numeric(df2['hits'])

  df['hits'] = df['hits'].str.replace('.1K','100')
  df['hits'] = df['hits'].str.replace('.2K','200')
  df['hits'] = df['hits'].str.replace('.3K','300')
  df['hits'] = df['hits'].str.replace('.4K','300')
  df['hits'] = df['hits'].str.replace('.5K','500')
  df['hits'] = df['hits'].str.replace('.6K','600')
  df['hits'] = df['hits'].str.replace('.7K','700')
  df['hits'] = df['hits'].str.replace('.8K','800')
  df['hits'] = df['hits'].str.replace('.9K','900')
  df2['hits'] = df2['hits'].str.replace('.1K','100')
  df2['hits'] = df2['hits'].str.replace('.2K','200')
  df2['hits'] = df2['hits'].str.replace('.3K','300')
  df2['hits'] = df2['hits'].str.replace('.4K','300')
  df2['hits'] = df2['hits'].str.replace('.5K','500')
  df2['hits'] = df2['hits'].str.replace('.6K','600')
  df2['hits'] = df2['hits'].str.replace('.7K','700')
  df2['hits'] = df2['hits'].str.replace('.8K','800')
  df2['hits'] = df2['hits'].str.replace('.9K','900')


In [None]:
X_num = df.select_dtypes(include = np.number)
X_cat = df.select_dtypes(include = object)

X_num2 = df2.select_dtypes(include = np.number)
X_cat2 = df2.select_dtypes(include = object)

In [None]:
print(X_num.shape)
X_num.head(2)

(11422, 55)


Unnamed: 0,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ls,ova
0,258,54,47,43,70,44.0,286,61,44.0,55,63,63,346,64,73,61.0,66,82.0,306,62,73.0,71,55,45,290,54,52.0,62.0,68.0,54,54.0,148,49,56,43.0,48,7,12,14,9,6,1682,357,4,2,1,69,51,63,63,51,60,3,59,64
1,365,66,79,76,68,76.0,375,83,78.0,72,63,79,404,83,83,88.0,75,75.0,372,74,81.0,75,74,68,313,54,33.0,78.0,72.0,76,70.0,77,35,20,22.0,55,11,7,14,7,16,1961,412,3,4,2,83,75,68,82,33,71,44,77,77


In [None]:
print(X_num2.shape)
X_num2.head(2)

(1949, 54)


Unnamed: 0,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ova
0,230,47,21,62,60,40.0,228,44,43.0,36,51,54,303,60,68,63.0,63,49.0,288,48,77.0,51,87,25,246,68,62.0,38.0,39.0,39,49.0,200,68,67,65.0,56,11,8,15,13,9,1551,334,2,2,1,64,30,50,50,66,74,34,67
1,298,62,60,44,62,70.0,330,76,68.0,56,60,70,375,77,72,83.0,64,79.0,323,76,62.0,63,47,75,286,72,26.0,64.0,64.0,60,65.0,61,19,23,19.0,53,14,13,9,9,8,1726,358,4,3,1,74,67,62,74,24,57,12,68


In [None]:
X_cat2string = X_cat.drop([])
X_cat = X_cat.drop(['st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk'],axis=1)
X_cat2 = X_cat2.drop(['st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk'],axis=1)

X_cat.head()

Unnamed: 0,bp,a/w,d/w
0,CM,High,Medium
1,ST,High,Low
2,CAM,High,Medium
3,CDM,Medium,Medium
4,CDM,Low,Medium


### Normalizing numerical data

In [None]:
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_num)
X_num_normalized = MinMaxtransformer.transform(X_num)
print(X_num_normalized.shape)
X_num_normalized = pd.DataFrame(X_num_normalized,columns=X_num.columns)
X_num_normalized.head()

(11422, 55)


Unnamed: 0,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ls,ova
0,0.546835,0.545455,0.478261,0.431818,0.710843,0.465116,0.569087,0.615385,0.444444,0.561798,0.642857,0.637363,0.663818,0.607143,0.729412,0.573171,0.591549,0.8125,0.55627,0.60241,0.685714,0.694118,0.454545,0.455556,0.654596,0.517241,0.552941,0.645161,0.682353,0.547619,0.5,0.518219,0.516854,0.595238,0.440476,0.084309,0.056818,0.116279,0.139535,0.078652,0.045455,0.604577,0.492366,0.75,0.25,0.0,0.614286,0.447368,0.558824,0.522388,0.506494,0.515625,0.0,0.565789,0.395833
1,0.817722,0.681818,0.826087,0.806818,0.686747,0.837209,0.777518,0.857143,0.822222,0.752809,0.642857,0.813187,0.82906,0.833333,0.847059,0.902439,0.71831,0.725,0.768489,0.746988,0.8,0.741176,0.701299,0.711111,0.718663,0.517241,0.329412,0.817204,0.729412,0.809524,0.690476,0.230769,0.359551,0.166667,0.190476,0.100703,0.102273,0.05814,0.139535,0.05618,0.159091,0.781945,0.70229,0.5,0.75,0.25,0.814286,0.763158,0.632353,0.80597,0.272727,0.6875,0.004458,0.802632,0.666667
2,0.744304,0.761364,0.793478,0.329545,0.807229,0.825581,0.892272,0.879121,0.944444,0.966292,0.77381,0.879121,0.88604,0.845238,0.764706,0.963415,0.760563,0.95,0.562701,0.807229,0.128571,0.741176,0.285714,0.822222,0.771588,0.758621,0.247059,0.83871,0.8,0.77381,0.833333,0.242915,0.224719,0.27381,0.261905,0.021077,0.045455,0.011628,0.046512,0.011236,0.011364,0.759059,0.671756,0.75,0.75,0.25,0.771429,0.789474,0.779412,0.865672,0.194805,0.453125,0.007611,0.776316,0.729167
3,0.506329,0.431818,0.423913,0.602273,0.614458,0.372093,0.505855,0.538462,0.411111,0.460674,0.571429,0.615385,0.481481,0.488095,0.564706,0.54878,0.43662,0.475,0.463023,0.542169,0.5,0.611765,0.493506,0.377778,0.562674,0.597701,0.611765,0.311828,0.517647,0.547619,0.428571,0.59919,0.58427,0.619048,0.583333,0.070258,0.068182,0.081395,0.046512,0.05618,0.113636,0.506039,0.385496,0.25,0.25,0.0,0.442857,0.355263,0.426471,0.432836,0.584416,0.515625,0.000435,0.473684,0.291667
4,0.524051,0.488636,0.369565,0.636364,0.686747,0.348837,0.555035,0.648352,0.444444,0.449438,0.619048,0.67033,0.60114,0.630952,0.647059,0.573171,0.535211,0.65,0.472669,0.590361,0.128571,0.811765,0.532468,0.433333,0.665738,0.655172,0.647059,0.569892,0.635294,0.488095,0.547619,0.668016,0.617978,0.654762,0.714286,0.093677,0.068182,0.081395,0.151163,0.033708,0.147727,0.593134,0.503817,0.25,0.5,0.0,0.571429,0.355263,0.514706,0.537313,0.623377,0.609375,0.000109,0.552632,0.416667


In [None]:
from sklearn.preprocessing import MinMaxScaler
MinMaxtransformer = MinMaxScaler().fit(X_num2)
X_num2_normalized = MinMaxtransformer.transform(X_num2)
print(X_num2_normalized.shape)
X_num2_normalized = pd.DataFrame(X_num2_normalized,columns=X_num2.columns)
X_num2_normalized.head()

(1949, 54)


Unnamed: 0,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ova
0,0.509804,0.493976,0.195402,0.635294,0.604938,0.45,0.463158,0.448276,0.430233,0.337209,0.518987,0.54321,0.535503,0.578313,0.679012,0.592105,0.52459,0.386667,0.541219,0.442857,0.774648,0.481928,0.911765,0.247059,0.517906,0.690476,0.690476,0.390805,0.358025,0.376471,0.4625,0.708502,0.707865,0.719512,0.695122,0.097156,0.095238,0.066667,0.152174,0.125,0.079545,0.526038,0.33871,0.25,0.25,0.0,0.53125,0.138889,0.328125,0.327869,0.688312,0.719298,0.010344,0.44186
1,0.70028,0.674699,0.643678,0.423529,0.62963,0.825,0.731579,0.816092,0.72093,0.569767,0.632911,0.740741,0.748521,0.783133,0.728395,0.855263,0.540984,0.786667,0.666667,0.842857,0.56338,0.626506,0.323529,0.835294,0.628099,0.738095,0.261905,0.689655,0.666667,0.623529,0.6625,0.145749,0.157303,0.182927,0.134146,0.090047,0.130952,0.122222,0.086957,0.079545,0.068182,0.641397,0.435484,0.75,0.5,0.0,0.6875,0.652778,0.515625,0.721311,0.142857,0.421053,0.003003,0.465116
2,0.0,0.012048,0.011494,0.035294,0.123457,0.0,0.0,0.011494,0.023256,0.011628,0.126582,0.0,0.127219,0.192771,0.148148,0.197368,0.163934,0.24,0.121864,0.328571,0.380282,0.13253,0.426471,0.023529,0.049587,0.119048,0.059524,0.034483,0.197531,0.082353,0.2375,0.008097,0.033708,0.012195,0.02439,0.601896,0.630952,0.555556,0.565217,0.579545,0.602273,0.036256,0.181452,0.25,0.0,0.0,0.40625,0.444444,0.375,0.409836,0.168831,0.350877,0.0,0.139535
3,0.467787,0.385542,0.310345,0.552941,0.592593,0.35,0.452632,0.528736,0.325581,0.360465,0.468354,0.54321,0.452663,0.566265,0.555556,0.5,0.278689,0.453333,0.376344,0.442857,0.366197,0.590361,0.485294,0.282353,0.473829,0.607143,0.607143,0.333333,0.37037,0.4,0.5875,0.54251,0.539326,0.536585,0.560976,0.049763,0.047619,0.033333,0.130435,0.034091,0.045455,0.413975,0.21371,0.5,0.25,0.0,0.4375,0.194444,0.28125,0.360656,0.519481,0.45614,0.001001,0.162791
4,0.691877,0.614458,0.632184,0.435294,0.82716,0.65,0.723684,0.758621,0.593023,0.511628,0.810127,0.790123,0.585799,0.674699,0.654321,0.631579,0.540984,0.48,0.716846,0.8,0.478873,0.759036,0.691176,0.682353,0.705234,0.666667,0.880952,0.563218,0.641975,0.564706,0.6375,0.753036,0.752809,0.731707,0.768293,0.106635,0.107143,0.055556,0.130435,0.147727,0.125,0.732367,0.633065,0.75,0.25,0.0,0.578125,0.583333,0.609375,0.655738,0.727273,0.666667,0.014014,0.511628


In [None]:
X_cat.head()

Unnamed: 0,bp,a/w,d/w
0,CM,High,Medium
1,ST,High,Low
2,CAM,High,Medium
3,CDM,Medium,Medium
4,CDM,Low,Medium


In [None]:
X_cat2.head()

Unnamed: 0,bp,a/w,d/w,ls
0,CB,Low,High,48+2
1,CAM,High,Low,64+2
2,GK,Medium,Medium,18+2
3,CDM,Medium,Medium,46+2
4,CDM,Medium,Medium,63+2


### Encoding categorical variables

In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(X_cat)
encoded = encoder.transform(X_cat).toarray()
encoded
cols = encoder.get_feature_names_out(input_features=X_cat.columns)
cols
onehot_encoded = pd.DataFrame(encoded, columns=cols)
onehot_encoded.head()

Unnamed: 0,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
from sklearn.preprocessing import OneHotEncoder
encoder2 = OneHotEncoder(drop='first').fit(X_cat2)
encoded2 = encoder2.transform(X_cat2).toarray()
encoded2
cols2 = encoder2.get_feature_names_out(input_features=X_cat2.columns)
cols2
onehot_encoded2 = pd.DataFrame(encoded2, columns=cols)
onehot_encoded2.head()

Unnamed: 0,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium,ls_17+2,ls_18+2,ls_19+2,ls_20+2,ls_21+2,ls_22+1,ls_22+2,ls_23+2,ls_24+2,ls_25+2,ls_26+1,ls_26+2,ls_26+3,ls_27+2,ls_28+1,ls_28+2,ls_29+2,ls_29+3,ls_30+2,ls_31+2,ls_31+3,ls_32+2,ls_32+3,ls_33+2,ls_33+3,ls_34+2,ls_35+2,ls_35+3,ls_36+2,ls_37+2,ls_38+2,ls_39+2,ls_40+2,ls_41+2,ls_42+1,ls_42+2,ls_43+1,ls_43+2,ls_44+1,ls_44+2,ls_45+2,ls_46+1,ls_46+2,ls_47+2,ls_48+1,ls_48+2,ls_49+1,ls_49+2,ls_50+1,ls_50+2,ls_51+1,ls_51+2,ls_52+1,ls_52+2,ls_53+1,ls_53+2,ls_54+2,ls_55+1,ls_55+2,ls_56+1,ls_56+2,ls_57+2,ls_57+3,ls_58+1,ls_58+2,ls_58+3,ls_59+1,ls_59+2,ls_60+0,ls_60+1,ls_60+2,ls_61+0,ls_61+1,ls_61+2,ls_61+3,ls_62+0,ls_62+1,ls_62+2,ls_63+0,ls_63+1,ls_63+2,ls_64+0,ls_64+1,ls_64+2,ls_64+3,ls_65+0,ls_65+1,ls_65+2,ls_65+3,ls_66+0,ls_66+1,ls_66+2,ls_66+3,ls_67+0,ls_67+1,ls_67+2,ls_68+-5,ls_68+0,ls_68+1,ls_68+2,ls_68+3,ls_69+-1,ls_69+0,ls_69+1,ls_69+2,ls_69+3,ls_70+0,ls_70+1,ls_70+2,ls_71+0,ls_71+1,ls_71+2,ls_71+3,ls_72+-1,ls_72+0,ls_72+1,ls_72+2,ls_72+3,ls_73+0,ls_73+1,ls_73+2,ls_73+3,ls_74+0,ls_74+1,ls_74+2,ls_74+3,ls_75+0,ls_75+1,ls_75+2,ls_75+3,ls_76+0,ls_76+2,ls_76+3,ls_77+0,ls_77+1,ls_77+2,ls_77+3,ls_78+0,ls_78+1,ls_78+2,ls_78+3,ls_79+0,ls_79+1,ls_79+2,ls_79+3,ls_80+0,ls_80+1,ls_80+2,ls_80+3,ls_81+2,ls_83+1,ls_85+3
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
onehot_encoded.shape, onehot_encoded2.shape 

((11422, 18), (1949, 170))

In [None]:
# corr = X_num_normalized.corr().loc[:,['ova']]
# fig, ax = plt.subplots(figsize = (5,15))

# ax = sns.heatmap(corr.sort_values(by=['ova'], ascending = False), ax = ax, annot = True, cbar = True, cmap='Reds')
# ax.set_xticklabels(ax.xaxis.get_ticklabels(), fontsize = 10)
# ax.set_yticklabels(ax.yaxis.get_ticklabels(), fontsize = 10)

### X-y split

In [None]:
X2_normalized = pd.concat([X_num2_normalized, onehot_encoded2], axis=1)

X_normalized = pd.concat([X_num_normalized, onehot_encoded], axis=1)
X_normalized.shape

(11422, 73)

In [None]:
X2_normalized.shape()

TypeError: 'tuple' object is not callable

In [None]:
X2_normalized.head(2)

Unnamed: 0,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ls,ova,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,0.546835,0.545455,0.478261,0.431818,0.710843,0.465116,0.569087,0.615385,0.444444,0.561798,0.642857,0.637363,0.663818,0.607143,0.729412,0.573171,0.591549,0.8125,0.55627,0.60241,0.685714,0.694118,0.454545,0.455556,0.654596,0.517241,0.552941,0.645161,0.682353,0.547619,0.5,0.518219,0.516854,0.595238,0.440476,0.084309,0.056818,0.116279,0.139535,0.078652,0.045455,0.604577,0.492366,0.75,0.25,0.0,0.614286,0.447368,0.558824,0.522388,0.506494,0.515625,0.0,0.565789,0.395833,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.817722,0.681818,0.826087,0.806818,0.686747,0.837209,0.777518,0.857143,0.822222,0.752809,0.642857,0.813187,0.82906,0.833333,0.847059,0.902439,0.71831,0.725,0.768489,0.746988,0.8,0.741176,0.701299,0.711111,0.718663,0.517241,0.329412,0.817204,0.729412,0.809524,0.690476,0.230769,0.359551,0.166667,0.190476,0.100703,0.102273,0.05814,0.139535,0.05618,0.159091,0.781945,0.70229,0.5,0.75,0.25,0.814286,0.763158,0.632353,0.80597,0.272727,0.6875,0.004458,0.802632,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
X_normalized.head(2)

Unnamed: 0,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ls,ova,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,0.546835,0.545455,0.478261,0.431818,0.710843,0.465116,0.569087,0.615385,0.444444,0.561798,0.642857,0.637363,0.663818,0.607143,0.729412,0.573171,0.591549,0.8125,0.55627,0.60241,0.685714,0.694118,0.454545,0.455556,0.654596,0.517241,0.552941,0.645161,0.682353,0.547619,0.5,0.518219,0.516854,0.595238,0.440476,0.084309,0.056818,0.116279,0.139535,0.078652,0.045455,0.604577,0.492366,0.75,0.25,0.0,0.614286,0.447368,0.558824,0.522388,0.506494,0.515625,0.0,0.565789,0.395833,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.817722,0.681818,0.826087,0.806818,0.686747,0.837209,0.777518,0.857143,0.822222,0.752809,0.642857,0.813187,0.82906,0.833333,0.847059,0.902439,0.71831,0.725,0.768489,0.746988,0.8,0.741176,0.701299,0.711111,0.718663,0.517241,0.329412,0.817204,0.729412,0.809524,0.690476,0.230769,0.359551,0.166667,0.190476,0.100703,0.102273,0.05814,0.139535,0.05618,0.159091,0.781945,0.70229,0.5,0.75,0.25,0.814286,0.763158,0.632353,0.80597,0.272727,0.6875,0.004458,0.802632,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
y = df['ova']
y2 = df2['ova']

# X = X.drop(['total claim amount'], axis=1)
X_normalized = X_normalized.drop(['ova'], axis=1)
X_ID = pd.concat([identity, X_normalized], axis=1)
X2_ID = pd.concat([identity2, X2_normalized], axis=1)

In [None]:
# display(y)
# display(y2)

In [None]:
X_ID.head()

Unnamed: 0,id,name,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ls,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,184383,A. Pasche,0.546835,0.545455,0.478261,0.431818,0.710843,0.465116,0.569087,0.615385,0.444444,0.561798,0.642857,0.637363,0.663818,0.607143,0.729412,0.573171,0.591549,0.8125,0.55627,0.60241,0.685714,0.694118,0.454545,0.455556,0.654596,0.517241,0.552941,0.645161,0.682353,0.547619,0.5,0.518219,0.516854,0.595238,0.440476,0.084309,0.056818,0.116279,0.139535,0.078652,0.045455,0.604577,0.492366,0.75,0.25,0.0,0.614286,0.447368,0.558824,0.522388,0.506494,0.515625,0.0,0.565789,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,188044,Alan Carvalho,0.817722,0.681818,0.826087,0.806818,0.686747,0.837209,0.777518,0.857143,0.822222,0.752809,0.642857,0.813187,0.82906,0.833333,0.847059,0.902439,0.71831,0.725,0.768489,0.746988,0.8,0.741176,0.701299,0.711111,0.718663,0.517241,0.329412,0.817204,0.729412,0.809524,0.690476,0.230769,0.359551,0.166667,0.190476,0.100703,0.102273,0.05814,0.139535,0.05618,0.159091,0.781945,0.70229,0.5,0.75,0.25,0.814286,0.763158,0.632353,0.80597,0.272727,0.6875,0.004458,0.802632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,184431,S. Giovinco,0.744304,0.761364,0.793478,0.329545,0.807229,0.825581,0.892272,0.879121,0.944444,0.966292,0.77381,0.879121,0.88604,0.845238,0.764706,0.963415,0.760563,0.95,0.562701,0.807229,0.128571,0.741176,0.285714,0.822222,0.771588,0.758621,0.247059,0.83871,0.8,0.77381,0.833333,0.242915,0.224719,0.27381,0.261905,0.021077,0.045455,0.011628,0.046512,0.011236,0.011364,0.759059,0.671756,0.75,0.75,0.25,0.771429,0.789474,0.779412,0.865672,0.194805,0.453125,0.007611,0.776316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,233796,J. Evans,0.506329,0.431818,0.423913,0.602273,0.614458,0.372093,0.505855,0.538462,0.411111,0.460674,0.571429,0.615385,0.481481,0.488095,0.564706,0.54878,0.43662,0.475,0.463023,0.542169,0.5,0.611765,0.493506,0.377778,0.562674,0.597701,0.611765,0.311828,0.517647,0.547619,0.428571,0.59919,0.58427,0.619048,0.583333,0.070258,0.068182,0.081395,0.046512,0.05618,0.113636,0.506039,0.385496,0.25,0.25,0.0,0.442857,0.355263,0.426471,0.432836,0.584416,0.515625,0.000435,0.473684,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,234799,Y. Demoncy,0.524051,0.488636,0.369565,0.636364,0.686747,0.348837,0.555035,0.648352,0.444444,0.449438,0.619048,0.67033,0.60114,0.630952,0.647059,0.573171,0.535211,0.65,0.472669,0.590361,0.128571,0.811765,0.532468,0.433333,0.665738,0.655172,0.647059,0.569892,0.635294,0.488095,0.547619,0.668016,0.617978,0.654762,0.714286,0.093677,0.068182,0.081395,0.151163,0.033708,0.147727,0.593134,0.503817,0.25,0.5,0.0,0.571429,0.355263,0.514706,0.537313,0.623377,0.609375,0.000109,0.552632,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
X2_ID.head()

Unnamed: 0,id,name,attacking,crossing,finishing,heading accuracy,short passing,volleys,skill,dribbling,curve,fk accuracy,long passing,ball control,movement,acceleration,sprint speed,agility,reactions,balance,power,shot power,jumping,stamina,strength,long shots,mentality,aggression,interceptions,positioning,vision,penalties,composure,defending,marking,standing tackle,sliding tackle,goalkeeping,gk diving,gk handling,gk kicking,gk positioning,gk reflexes,total stats,base stats,w/f,sm,ir,pac,sho,pas,dri,def,phy,hits,ls,ova,bp_CB,bp_CDM,bp_CF,bp_CM,bp_GK,bp_LB,bp_LM,bp_LW,bp_LWB,bp_RB,bp_RM,bp_RW,bp_RWB,bp_ST,a/w_Low,a/w_Medium,d/w_Low,d/w_Medium
0,219461.0,E. Palmer-Brown,0.546835,0.545455,0.478261,0.431818,0.710843,0.465116,0.569087,0.615385,0.444444,0.561798,0.642857,0.637363,0.663818,0.607143,0.729412,0.573171,0.591549,0.8125,0.55627,0.60241,0.685714,0.694118,0.454545,0.455556,0.654596,0.517241,0.552941,0.645161,0.682353,0.547619,0.5,0.518219,0.516854,0.595238,0.440476,0.084309,0.056818,0.116279,0.139535,0.078652,0.045455,0.604577,0.492366,0.75,0.25,0.0,0.614286,0.447368,0.558824,0.522388,0.506494,0.515625,0.0,0.565789,0.395833,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,221896.0,D. Avdijaj,0.817722,0.681818,0.826087,0.806818,0.686747,0.837209,0.777518,0.857143,0.822222,0.752809,0.642857,0.813187,0.82906,0.833333,0.847059,0.902439,0.71831,0.725,0.768489,0.746988,0.8,0.741176,0.701299,0.711111,0.718663,0.517241,0.329412,0.817204,0.729412,0.809524,0.690476,0.230769,0.359551,0.166667,0.190476,0.100703,0.102273,0.05814,0.139535,0.05618,0.159091,0.781945,0.70229,0.5,0.75,0.25,0.814286,0.763158,0.632353,0.80597,0.272727,0.6875,0.004458,0.802632,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,247428.0,D. Ochoa,0.744304,0.761364,0.793478,0.329545,0.807229,0.825581,0.892272,0.879121,0.944444,0.966292,0.77381,0.879121,0.88604,0.845238,0.764706,0.963415,0.760563,0.95,0.562701,0.807229,0.128571,0.741176,0.285714,0.822222,0.771588,0.758621,0.247059,0.83871,0.8,0.77381,0.833333,0.242915,0.224719,0.27381,0.261905,0.021077,0.045455,0.011628,0.046512,0.011236,0.011364,0.759059,0.671756,0.75,0.75,0.25,0.771429,0.789474,0.779412,0.865672,0.194805,0.453125,0.007611,0.776316,0.729167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,255120.0,N. Kenneh,0.506329,0.431818,0.423913,0.602273,0.614458,0.372093,0.505855,0.538462,0.411111,0.460674,0.571429,0.615385,0.481481,0.488095,0.564706,0.54878,0.43662,0.475,0.463023,0.542169,0.5,0.611765,0.493506,0.377778,0.562674,0.597701,0.611765,0.311828,0.517647,0.547619,0.428571,0.59919,0.58427,0.619048,0.583333,0.070258,0.068182,0.081395,0.046512,0.05618,0.113636,0.506039,0.385496,0.25,0.25,0.0,0.442857,0.355263,0.426471,0.432836,0.584416,0.515625,0.000435,0.473684,0.291667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,215556.0,E. Fernandes,0.524051,0.488636,0.369565,0.636364,0.686747,0.348837,0.555035,0.648352,0.444444,0.449438,0.619048,0.67033,0.60114,0.630952,0.647059,0.573171,0.535211,0.65,0.472669,0.590361,0.128571,0.811765,0.532468,0.433333,0.665738,0.655172,0.647059,0.569892,0.635294,0.488095,0.547619,0.668016,0.617978,0.654762,0.714286,0.093677,0.068182,0.081395,0.151163,0.033708,0.147727,0.593134,0.503817,0.25,0.5,0.0,0.571429,0.355263,0.514706,0.537313,0.623377,0.609375,0.000109,0.552632,0.416667,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
X_cat.head()

Unnamed: 0,bp,a/w,d/w
0,CM,High,Medium
1,ST,High,Low
2,CAM,High,Medium
3,CDM,Medium,Medium
4,CDM,Low,Medium


### Creating the model

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_normalized, y2, test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [11422, 1949]

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
lm = linear_model.LinearRegression()

### Training the model

In [None]:
lm.fit(X_train,y_train)

LinearRegression()

### Predictions

In [None]:
from sklearn.metrics import r2_score
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

0.9155397414848581

In [None]:
predictions2 = lm.predict(X_train)
r2_score(y_train, predictions2)

0.9155397414848581

In [None]:
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

0.9063807888747071

In [None]:
predictions_test2 = lm.predict(X_test)
r2_score(y_test, predictions_test2)

0.9063807888747071

In [None]:
y_test[:5]

4971     65
10174    66
5002     60
5273     63
2646     67
Name: ova, dtype: int64

In [None]:
predictions_test[:5]

array([63.73838808, 67.47281366, 59.75920609, 64.00324327, 65.93124946])

### Errors

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
mse=mean_squared_error(y_test,predictions_test)
mse

4.385418465268338

In [None]:
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
rmse

2.0941390749585707

In [None]:
mae = mean_absolute_error(y_test, predictions_test)
mae

1.6075909356974603

### Looking for the best player of the league

In [None]:
id_max = np.argmax(lm.predict(X_test)) # find id of the maximum predicted label
print(X_ID.loc[id_max])

id                    190782
name                  Sandro
attacking           0.701266
crossing            0.840909
finishing           0.565217
heading accuracy    0.556818
short passing       0.746988
volleys             0.616279
skill               0.723653
dribbling           0.758242
curve               0.777778
fk accuracy         0.617978
long passing         0.72619
ball control        0.758242
movement            0.834758
acceleration        0.892857
sprint speed        0.752941
agility             0.865854
reactions           0.690141
balance                 0.85
power                0.66881
shot power          0.650602
jumping             0.757143
stamina             0.776471
strength            0.584416
long shots          0.555556
mentality           0.821727
aggression          0.747126
interceptions       0.788235
positioning         0.741935
vision              0.752941
penalties           0.607143
composure           0.678571
defending           0.785425
marking       