In [652]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import numpy as np
import statistics as stats
import sklearn
import re
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', None)

In [653]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [654]:
def preprocess(data):

    # Fix the weights (as kg)
    Weight = [] 
    for i in data['Weight']:
        Weight.append(str(i).replace('lbs' , ''))

    data['Weight'] = Weight
    data['Weight'] = data['Weight'].astype(np.number)
    data['Weight'] = data['Weight']/2.2046

    
    # Fix the heights (as cm)
    

    cmheight = []
    for item in data['Height']:
        item = str(item).replace('\"', '')
        feetinches = item.split('\'')
        cmheight.append(round(int(feetinches[0]) * 30.48) + int(feetinches[1]) * 2.54)
    data['Height'] = cmheight
        
    # Remove the stars from columns 'IR', 'W/F', and 'SM'
    newvalues = []
    for item in data['IR']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['IR']=newvalues

    newvalues = []
    for item in data['W/F']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['W/F']=newvalues
    
    newvalues = []
    for item in data['SM']:
        item = str(item).replace('★', '')
        item = item.strip()
        newvalues.append(int(item))
    data['SM']=newvalues
    

    # Transform the 'Contract' column to only give the end date of the contract 
#     contractend = []
#     for item in data['Contract']:
#         if 'Free' in item:
#             contractend.append('DELETE')
#         else:
#             item = item.replace(' On Loan', '')
#             item = item[-4:]
#             contractend.append(int(item))
        
#     data['Contract']=contractend

    # Transform the 'loan date end' column to an on loan (yes/no) column
    data['Loan Date End'] = data['Loan Date End'].fillna(0)

    loanyesno = []
    for item in data['Loan Date End']:
        if item == 0:
            loanyesno.append(item)
        else: 
            loanyesno.append(1)

    data['Loan Date End'] = loanyesno
    data = data.rename(columns={'Loan Date End':'On Loan'})

    data['On Loan']
    
    # Remove the plusses and minuses on the last few rows   
    toclean = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK']

    superlist = []

    for column in toclean: 
        values = []
        for item in data[column]:
            if item[1] == '-' or item[1] == '+':
                values.append(int(item[0]))
            else: 
                values.append(int(item[:2]))
        superlist.append(values)


    for i in range(len(toclean)):
        data[toclean[i]] = superlist[i]

    # Clean up currency cells by removing € sign and converting 'M' and 'K' to numbers
    
    newvalues1 = []
    for item in data['Value']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item = item * 1000
            newvalues1.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item = item * 1000000      
            newvalues1.append(int(item))
        else: 
            item = int(item)
            newvalues1.append(int(item))
    data['Value']=newvalues1
    
    
    newvalues2 = []
    for item in data['Wage']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues2.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues2.append(int(item))
        else: 
            item = int(item)
            newvalues2.append(int(item))
    data['Wage']=newvalues2
    
    
    newvalues3 = []
    for item in data['Release Clause']:
        item = item.replace('€', '')
        if 'K' in item: 
            item = float(item.replace('K', ''))
            item *= 1000
            newvalues3.append(int(item))
        elif 'M' in item: 
            item = float(item.replace('M', ''))
            item *= 1000000      
            newvalues3.append(int(item))
        else: 
            item = int(item)
            newvalues3.append(int(item))
    data['Release Clause']=newvalues3
    
    
    # Clean up Hits column (which contains some values with K in them)¶
    newhits = []
    for item in data['Hits']:
        if 'K' in item:
            item = item.replace('K','')
            item = float(item) * 1000
            newhits.append(int(item))
        else: 
            newhits.append(int(item))
    data['Hits'] = newhits

    # Dropping stuff
    
    # Drop columns with irrelevant attributes
    #Some atributes are used to calculate the rating per category, that we already have, so individual attributes are not necessary here

    # Different versions for commenting out and experimenting
    
#     # First, a version with only the summed up columns (Attacking etc)
#     data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
#            'Dribbling','Curve','FK Accuracy','Long Passing','Ball Control',
#            'Acceleration','Sprint Speed','Agility','Reactions','Balance',
#           'Shot Power','Jumping','Stamina','Strength','Long Shots',
#           'Aggression','Interceptions','Positioning','Vision','Penalties','Composure',
#           'Marking','Standing Tackle','Sliding Tackle',
#           'GK Diving','GK Handling','GK Kicking','GK Positioning','GK Reflexes'], axis=1)
    
    # Then, a version with only the component columns
    data = data.drop(['Nationality','Club','Team & Contract', 'Position', 'ID','Growth','Joined', 'Contract','Name','Crossing','Finishing','Heading Accuracy','Short Passing','Volleys',
'Attacking', 'Skill', 'Movement', 'Power', 'Mentality', 'Defending', 'Defending'], axis=1)
    
    
    
    #data = data.drop(['Age', 'Height', 'Weight', 'On Loan', 'Value', 'Wage', 'Release Clause'], axis=1)
    #data = data.drop(['Value', 'Wage', 'Release Clause'], axis=1)
    #data = data.drop(['Age', 'Height', 'Weight'], axis=1)

    for column in data.columns: 
        data = data[data[column].isna()==False]

    #Drop the rows with less than 5% of NaN
#     data = data[data['A/W'].isna()==False]
#     data = data[data['D/W'].isna()==False]

    return data



In [655]:
df = preprocess(pd.read_csv('fifa21_train.csv'))

### And a function to do the X-Y split

In [656]:
y = df['OVA']
X = df.drop(['OVA'], axis = 1)

#Split numerical and categorical data
X_num = X.select_dtypes(np.number)
X_cat = X.select_dtypes(object)


transformer = MinMaxScaler().fit(X_num)
# encoder = OneHotEncoder().fit(X_cat)


def xysplit(dataframe): 
    
    #Initial split (y will remain unchanged, X will undergo more operations)
    y = dataframe['OVA']
    X = dataframe.drop(['OVA'], axis = 1)
    
    #Split numerical and categorical data
    X_num = X.select_dtypes(np.number)
    X_cat = X.select_dtypes(object)

    #Normalize the numerical data
    x_normalized = transformer.transform(X_num)
    X_normal = pd.DataFrame(x_normalized, columns=X_num.columns)

    #Encode categorical data
    encoder = OneHotEncoder().fit(X_cat)

    encoded = encoder.transform(X_cat).toarray()

    # And get relevant headers for the encoded categorical data
    headers = []

    for category in encoder.categories_:
        for unit in category: 
            headers.append(unit)

    categ_encoded=pd.DataFrame(encoded, columns=headers)
    
    #Finally, concatenate the (normalized) numerical and (encoded) categorical data
    X = pd.concat([X_normal, categ_encoded], axis = 1)
    
    
    return X, y

In [657]:
X, y = xysplit(df)

### Test train split

In [658]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

### Build the model 

In [659]:
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

LinearRegression()

### A function to generate scores to evaluate

In [660]:
def scores(y_data, predictions):
    print("r2 score:",r2_score(y_data, predictions))
    print("MSE score:",mean_squared_error(y_data, predictions))
    print("RMSE score:",np.sqrt(mean_squared_error(y_data, predictions)))
    print("MAE score:", mean_absolute_error(y_data, predictions))

### Get the R2 score for the training and test data

In [661]:
predictions = lm.predict(X_train)
predictions_test = lm.predict(X_test)

print("Training data")
scores(y_train, predictions)

print("Test data")
scores(y_test, predictions_test)

Training data
r2 score: 0.9204087603462355
MSE score: 3.702127630892997
RMSE score: 1.9240913779997553
MAE score: 1.4901430460456113
Test data
r2 score: 0.9229505531657108
MSE score: 3.7098033095345193
RMSE score: 1.9260849694482638
MAE score: 1.5127089886487965


### Validate new data

In [662]:
df2 = preprocess(pd.read_csv('fifa21_validate.csv'))

### y2 here gives a meaningless value, but we'll ignore that and keep working with the original y


In [663]:
X2, y2 = xysplit(df2)

In [664]:
predictions2 = lm.predict(X2)

In [665]:
print(predictions2)

[65.90234375 65.79785156 53.73144531 ... 73.96435547 64.20751953
 61.1015625 ]


In [666]:
scores(y2, predictions2)

r2 score: 0.9185752612772627
MSE score: 3.683665013545718
RMSE score: 1.9192876317909513
MAE score: 1.4915917417585942


In [667]:
# y = df['ova']
# y2 = df2['ova']

# X = X.drop(['total claim amount'], axis=1)


### Training the model

### Predictions

In [668]:
predictions2 = lm.predict(X_train)
r2_score(y_train, predictions2)

0.9204087603462355

### Looking for the best player of the league

In [669]:
# df = df.drop(['id', 'name'], axis=1)
# identity = df.loc[:,['id','name']]
# X_ID = pd.concat([identity, X], axis=1)

In [670]:
id_max = np.argmax(lm.predict(X_test)) # find id of the maximum predicted label
print(X_ID.loc[id_max])

id                    203067
name                D. Chará
attacking           0.724051
crossing            0.659091
finishing           0.608696
heading accuracy    0.738636
short passing       0.638554
volleys              0.77907
skill               0.655738
dribbling            0.67033
curve                    0.6
fk accuracy         0.674157
long passing        0.690476
ball control        0.681319
movement            0.495726
acceleration        0.416667
sprint speed        0.458824
agility             0.487805
reactions            0.56338
balance                0.675
power               0.797428
shot power           0.86747
jumping             0.785714
stamina             0.694118
strength            0.753247
long shots          0.711111
mentality           0.857939
aggression           0.91954
interceptions       0.776471
positioning         0.731183
vision                   0.6
penalties           0.761905
composure           0.738095
defending           0.728745
marking       

In [671]:
id_max = np.argmax(lm.predict(X_test)) # find id of the maximum predicted label
print(df2.loc[id_max])

Age                      31
BP                       CB
Height               190.62
Weight             83.91545
foot                  Right
On Loan                   0
Value               7500000
Wage                  38000
Release Clause     12800000
Dribbling                55
Curve                  70.0
FK Accuracy              74
Long Passing             75
Ball Control             60
Acceleration             50
Sprint Speed             55
Agility                34.0
Reactions                66
Balance                42.0
Shot Power               85
Jumping                71.0
Stamina                  72
Strength                 90
Long Shots               67
Aggression               85
Interceptions          79.0
Positioning            44.0
Vision                 52.0
Penalties                52
Composure              71.0
Marking                  75
Standing Tackle          82
Sliding Tackle         76.0
Goalkeeping              56
GK Diving                15
GK Handling         