In [None]:
"""In the FIFA 21 Project we will try to predict a player's "Overall Rating" by analysing data from the following dataset: fifa21_train.csv
It can be found in the GDrive.

The database is an extract from https://sofifa.com/

Explanations of the acronyms and abbreviations can be found here and here"""

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the training extract (relative path, resolved against the working directory) and preview it
df = pd.read_csv(filepath_or_buffer='fifa21_train.csv')
df

In [None]:
# Materialise the column labels as a plain list so every name can be inspected
column_names = list(df.columns)
column_names

In [None]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
fifa_df = df.copy(deep=True)
fifa_df

In [None]:
# Columns kept for modelling: the best position ('BP'), a selection of rating
# columns, and the target 'OVA'.
# (An earlier variant of this list kept 'Position' and the per-position rating
# columns 'LS' ... 'GK' instead; it is no longer used.)
columns_to_retain = [
    'BP',
    'Attacking', 'Crossing', 'Short Passing',
    'Skill', 'Ball Control', 'FK Accuracy',
    'Movement', 'Sprint Speed',
    'Power', 'Shot Power', 'Strength',
    'Mentality', 'Composure', 'Vision',
    'Defending', 'Standing Tackle', 'Sliding Tackle',
    'Goalkeeping', 'GK Reflexes', 'GK Handling',
    'OVA',
]

# Restrict the working copy to exactly those columns, in the listed order
fifa_df_filtered = fifa_df[columns_to_retain]
fifa_df_filtered

In [None]:
# Boolean mask: True for every row that exactly repeats an earlier row
check_duplicates = fifa_df_filtered.duplicated(keep='first')
check_duplicates

In [None]:
# How many duplicate values
# Summing the boolean mask counts the rows flagged as duplicates.
num_duplicates = check_duplicates.sum()
num_duplicates

In [None]:
# Give the two abbreviated columns descriptive names
column_rename = {
    'BP': 'Best Position',
    'OVA': 'overall value',
}
fifa_df_filtered = fifa_df_filtered.rename(mapper=column_rename, axis='columns')
fifa_df_filtered

In [None]:
# Normalise the headers to snake_case: lowercase, with spaces replaced by '_'
cols = [label.lower().replace(' ', '_') for label in fifa_df_filtered.columns]
fifa_df_filtered.columns = cols
fifa_df_filtered

In [None]:
#def convert_position_rating(value):
    # If it is a full number, it remains as it is
    #if isinstance(value, int):
        #return value
    
    # If the value contains '+', we split it and add it
    #if '+' in value:
       # base, added = value.split('+')
        #return int(base) + int(added)
    # If there is no '+', we just convert the value to a full number
    #else:
        #return int(value)

# Columns to apply it
#columns_to_process = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
       #'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm',
       #'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb','gk']

# Applying the function to each column
#for col in columns_to_process:
    #fifa_df_filtered[col] = fifa_df_filtered[col].apply(convert_position_rating)
    

In [None]:
#Check null values
# Count of missing entries per column.
fifa_df_filtered.isna().sum()

In [None]:
# Split the columns by dtype: object (categorical) versus numeric
cat_df = fifa_df_filtered.select_dtypes(include=['object'])
num_df = fifa_df_filtered.select_dtypes(include=[np.number])


In [None]:
# Replace nulls column by column: mode for categorical (object) columns,
# median for numerical ones.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the `fifa_df_filtered[col]` selection: that form is
# chained in-place mutation, which pandas 2.x warns about and which does not
# update the DataFrame at all once Copy-on-Write is active.
for col in fifa_df_filtered.columns:
    if fifa_df_filtered[col].dtype == 'object':
        fill_value = fifa_df_filtered[col].mode()[0]
    else:
        fill_value = fifa_df_filtered[col].median()
    fifa_df_filtered[col] = fifa_df_filtered[col].fillna(fill_value)

In [None]:
# Checking the replacement of the null values
# The grand total over all columns should be 0 once every null has been filled.
null_counts = fifa_df_filtered.isnull().sum()
null_counts.sum()


In [None]:
# Checking the statistics
# Transposed so that each column of the frame becomes one row of the summary.
fifa_df_filtered.describe().T

In [None]:
# Plotting the numerical columns
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# One histogram with a KDE overlay per numerical column.
# NOTE(review): num_df is built before the null-replacement cell, so these plots
# presumably show the data before imputation - confirm that is intended.
for column_name in num_df.columns:
    sns.displot(num_df[column_name], kde=True)
    plt.show()

In [None]:
# The overall value shows a roughly normal (bell-shaped) distribution

In [None]:
# Getting the correlations description
# Pairwise correlation matrix of the numerical columns.
correlations = num_df.corr()
correlations

In [None]:
# Draw the correlation matrix as an annotated heatmap on a 15x10 inch figure
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(num_df.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# There seems to be some multicollinearity among some of the features

In [None]:
# NOTE: plain assignment - fifa_new refers to the same DataFrame object as
# fifa_df_filtered (no copy is made).
fifa_new = fifa_df_filtered
fifa_new

In [None]:
# Separate the target column 'overall_value' from the feature columns
y = fifa_new['overall_value']
X = fifa_new.drop(columns=['overall_value'])


In [None]:
# Partition the features by dtype so each group can get its own transformation
X_num = X.select_dtypes(include=[np.number])
X_cat = X.select_dtypes(include=['object'])

In [None]:
# Preview of the numerical feature columns
X_num

In [None]:
#Getting the stats
# Summary statistics of the numerical features, one feature per row.
X_num.describe().T

In [None]:
# importing the library
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale the numerical features with a MinMaxScaler fitted on X_num
transformer = MinMaxScaler()
transformer.fit(X_num)
X_norm = transformer.transform(X_num)
print(X_norm.shape)
# Wrap the transformed values back into a DataFrame with the original column labels
X_num_scale = pd.DataFrame(data=X_norm, columns=X_num.columns)
X_num_scale.head()

In [None]:
# One hot encoded
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fit a one-hot encoder on the categorical features; drop='first' omits the
# first category of each feature from the encoded output
encoder = OneHotEncoder(drop='first')
encoder.fit(X_cat)
cols = encoder.get_feature_names_out(input_features=X_cat.columns)
encoded_values = encoder.transform(X_cat).toarray()
X_cat_encode = pd.DataFrame(encoded_values, columns=cols)
X_cat_encode.head()

In [None]:
# Combine the scaled numerical and one-hot encoded categorical features column-wise
X = pd.concat(objs=[X_num_scale, X_cat_encode], axis='columns')
X.describe()

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [None]:
# Report the shape of each part of the split, one per line
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# First rows of the training feature matrix
X_train.head()

In [None]:
# Fit a linear regression on the training split, then predict the test split
from sklearn.linear_model import LinearRegression as linreg

lm = linreg().fit(X_train, y_train)   # fit() returns the fitted estimator itself
y_pred = lm.predict(X_test)

In [None]:
# Model Validation

In [None]:
# Score the fitted model on the training and on the test split
train_r2 = lm.score(X_train, y_train)
test_r2 = lm.score(X_test, y_test)
print('train R2: {} -- test R2: {}'.format(train_r2, test_r2))

In [None]:
# MSE
from sklearn.metrics import mean_squared_error as mse

# Arguments are passed by keyword in scikit-learn's (y_true, y_pred) order;
# the squared error is symmetric, so the values are unchanged.
train_mse = mse(y_true=y_train, y_pred=lm.predict(X_train))
test_mse = mse(y_true=y_test, y_pred=y_pred)

print('train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))

In [None]:
# RMSE: the square root of each MSE value computed in the MSE cell
train_rmse = train_mse ** .5
test_rmse = test_mse ** .5
print('train RMSE: {} -- test RMSE: {}'.format(train_rmse, test_rmse))

In [None]:
    # MAE
    from sklearn.metrics import mean_absolute_error as mae

    # Arguments are passed by keyword in scikit-learn's (y_true, y_pred) order;
    # the absolute error is symmetric, so the values are unchanged.
    train_mae = mae(y_true=y_train, y_pred=lm.predict(X_train))
    test_mae = mae(y_true=y_test, y_pred=y_pred)

    print('train MAE: {} -- test MAE: {}'.format(train_mae, test_mae))

In [None]:
# If we increase the test sample from 0.20 to 0.35, or decrease it from 0.20 to 0.10, the R2 score drops from
# 0.84 to 0.83, which is not a significant change in the effectiveness of the prediction model.

# BONUS

Try to create a new function called `preprocess` which:

- takes a dataframe as input
- calls all the other group members' functions and applies them to the starting dataframe
- returns a clean dataframe.

In [None]:
    # Your code here
    def clean_fifa(fifa_new):
        """Return a cleaned copy of a raw FIFA 21 dataframe.

        The steps mirror the manual cleaning cells of this notebook:

        1. keep only the columns used for modelling (see ``n_cols``);
        2. rename 'BP' to 'Best Position' and 'OVA' to 'overall value';
        3. convert every header to snake_case (lowercase, spaces -> '_');
        4. drop exact duplicate rows and renumber the index from 0;
        5. fill nulls with the column mode (non-numeric columns) or the
           column median (numeric columns).

        Parameters
        ----------
        fifa_new : pandas.DataFrame
            Raw data containing at least the columns listed in ``n_cols``.
            The input frame is not modified.

        Returns
        -------
        pandas.DataFrame
            A new, cleaned dataframe.

        Raises
        ------
        KeyError
            If any of the expected raw columns is missing from ``fifa_new``.
        """
        n_cols = ['BP','Attacking','Crossing','Short Passing','Skill','Ball Control','FK Accuracy','Movement','Sprint Speed',
                  'Power','Shot Power', 'Strength', 'Mentality','Composure','Vision','Defending','Standing Tackle','Sliding Tackle','Goalkeeping',
                  'GK Reflexes','GK Handling','OVA']

        # Selecting with a list returns a new frame, so the caller's data is never touched.
        cleaned = fifa_new[n_cols].rename(columns={'BP': 'Best Position', 'OVA': 'overall value'})
        cleaned.columns = [name.lower().replace(' ', '_') for name in cleaned.columns]

        # Renumber the rows after dropping duplicates so the index has no gaps.
        cleaned = cleaned.drop_duplicates().reset_index(drop=True)

        for col in cleaned.columns:
            if pd.api.types.is_numeric_dtype(cleaned[col]):
                fill_value = cleaned[col].median()
            else:
                modes = cleaned[col].mode()
                if modes.empty:
                    # Column holds nothing but nulls: there is no mode to fill with.
                    continue
                fill_value = modes.iloc[0]
            # Assign back instead of fillna(inplace=True) on a column selection,
            # which does not reliably update the parent frame.
            cleaned[col] = cleaned[col].fillna(fill_value)

        return cleaned

    # The exercise text asks for a function named `preprocess`; expose it as an alias.
    preprocess = clean_fifa

# FIFA MODEL VALIDATION

In [None]:
""""@canal It’s TIME!!!  The Validation Set is in the Google Drive!!  
Remember the process!!  
You are receiving new and unseen data to test the model you created.  
The actual target column OVA is included in the data.  
First do the X,y split.  
Then clean, scale, and encode the X exactly like you did the training set. 
Make sure all of the columns are the same in the same order.  
Re-use the scaler, encoder, and model that has already been trained. 
You should submit your RMSE metric by 12:30.  
We will ask the group with the best score to share their process with the whole group."""

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the unseen validation extract (relative path) and preview it
fifa_validation = pd.read_csv(filepath_or_buffer='fifa21_validate.csv')
fifa_validation

In [None]:
# Same column selection as for the training data, in the same order
columns_to_retain = [
    'BP',
    'Attacking', 'Crossing', 'Short Passing',
    'Skill', 'Ball Control', 'FK Accuracy',
    'Movement', 'Sprint Speed',
    'Power', 'Shot Power', 'Strength',
    'Mentality', 'Composure', 'Vision',
    'Defending', 'Standing Tackle', 'Sliding Tackle',
    'Goalkeeping', 'GK Reflexes', 'GK Handling',
    'OVA',
]

# Restrict the validation frame to exactly those columns
fifa_val_filtered = fifa_validation[columns_to_retain]
fifa_val_filtered

In [None]:
# Boolean mask: True for every validation row that exactly repeats an earlier row
check_duplicates = fifa_val_filtered.duplicated(keep='first')
check_duplicates

In [None]:
# Summing the boolean mask counts the validation rows flagged as duplicates.
num_duplicates = check_duplicates.sum()
num_duplicates

In [None]:
# Apply the same two descriptive column names as on the training data
column_rename = {
    'BP': 'Best Position',
    'OVA': 'overall value',
}
fifa_val_filtered = fifa_val_filtered.rename(mapper=column_rename, axis='columns')
fifa_val_filtered

In [None]:
# Normalise the validation headers to snake_case: lowercase, spaces replaced by '_'
cols = [label.lower().replace(' ', '_') for label in fifa_val_filtered.columns]
fifa_val_filtered.columns = cols
fifa_val_filtered

In [None]:
# Count of missing entries per validation column.
fifa_val_filtered.isna().sum()

In [None]:
# Split the validation columns by dtype and preview the numerical part
num_df = fifa_val_filtered.select_dtypes(include=[np.number])
cat_df = fifa_val_filtered.select_dtypes(include=['object'])
num_df

In [None]:
# Replace nulls column by column: mode for categorical (object) columns,
# median for numerical ones.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the `fifa_val_filtered[col]` selection: that form is
# chained in-place mutation, which pandas 2.x warns about and which does not
# update the DataFrame at all once Copy-on-Write is active.
for col in fifa_val_filtered.columns:
    if fifa_val_filtered[col].dtype == 'object':
        fill_value = fifa_val_filtered[col].mode()[0]
    else:
        fill_value = fifa_val_filtered[col].median()
    fifa_val_filtered[col] = fifa_val_filtered[col].fillna(fill_value)

In [None]:
# Checking the replacement of the null values
# The grand total over all columns should be 0 once every null has been filled.
null_counts = fifa_val_filtered.isnull().sum()
null_counts.sum()

In [None]:
# Separate the validation target 'overall_value' from the validation features
y_validate = fifa_val_filtered['overall_value']
X_validate = fifa_val_filtered.drop(columns=['overall_value'])

In [None]:
# Split the validation features by dtype: object/category columns are treated
# as categorical, every other column as numerical
categorical_dtypes = ["object", "category"]
X_cat_validate = X_validate.select_dtypes(include=categorical_dtypes)
X_num_validate = X_validate.select_dtypes(exclude=categorical_dtypes)

print(X_validate.shape, X_num_validate.shape, X_cat_validate.shape, y_validate.shape)

In [None]:
# One-hot encode the validation categoricals with the already-fitted encoder (no re-fit)
cols = encoder.get_feature_names_out(input_features=X_cat_validate.columns)
encoded_validate = encoder.transform(X_cat_validate).toarray()
onehot_encoded_validate = pd.DataFrame(data=encoded_validate, columns=cols)

In [None]:
# Scale the validation numericals with the already-fitted MinMaxScaler (no re-fit)
x_normalized_validate = transformer.transform(X_num_validate)
print(x_normalized_validate.shape)
normalized_df_validate = pd.DataFrame(
    data=x_normalized_validate,
    columns=X_num_validate.columns,
)
normalized_df_validate

In [None]:
# Assemble the validation feature matrix: scaled numericals followed by the
# one-hot encoded categoricals
X_validate = pd.concat(
    objs=[normalized_df_validate, onehot_encoded_validate],
    axis='columns',
)
X_validate

In [None]:
# Predict the validation target and show the first rows next to the model's estimate
results_for_validate = lm.predict(X_validate)
estimate = pd.Series(results_for_validate, name='estimate')
comparision = pd.concat([fifa_val_filtered, estimate], axis=1).head()
comparision

In [None]:
# Raw array of predictions for the validation set
results_for_validate

In [None]:
from sklearn.metrics import r2_score

# R-squared of the validation predictions against the true validation target
r2 = r2_score(y_true=y_validate, y_pred=results_for_validate)
print("R-squared (R2) Score:", r2)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import math

# Mean Absolute Error on the validation set.
# Stored as `validate_mae` rather than `mae`: the train/test evaluation cell
# binds `mae` to the mean_absolute_error function alias, and reusing the name
# here would silently turn that callable into a float.
validate_mae = mean_absolute_error(y_validate, results_for_validate)
print(validate_mae)


In [None]:
# Mean Squared Error on the validation set.
# NOTE: this rebinds `mse`, which the train/test evaluation cell imported as an
# alias for sklearn's mean_squared_error, to a plain number.
mse = mean_squared_error(y_true=y_validate, y_pred=results_for_validate)
print(mse)

In [None]:
# Root Mean Squared Error
# Square root of the validation MSE held in `mse` (set in the previous cell).
rmse = math.sqrt(mse)
print(rmse)

In [None]:
# The obtained R2 score of 0.83 and RMSE of 2.75 show that this is a fairly good predictive model.