In [None]:
"""In the FIFA 21 Project we will try to predict a player's "Overall Rating" by analysing data from the following dataset: fifa21_train.csv
It can be found in the GDrive.

The database is an extract from https://sofifa.com/

Explanations of the acronyms and abbreviations can be found here and here"""

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training extract; the relative path means the CSV must sit in the working directory.
df = pd.read_csv('fifa21_train.csv')
df

In [None]:
# Column labels of the raw frame as a plain Python list
column_names = list(df.columns)
column_names

In [None]:
# Work on a copy so the raw frame `df` stays available unchanged
fifa_df = df.copy()
fifa_df

In [None]:
# Columns kept for modelling: the two position labels, the aggregate skill
# scores and the target ('OVA'). The per-position rating columns
# ('LS', 'ST', ..., 'GK') are not retained in this version.
columns_to_retain = [
    'BP',
    'Position',
    'Attacking',
    'Skill',
    'Movement',
    'Power',
    'Mentality',
    'Composure',
    'Defending',
    'Goalkeeping',
    'OVA',
]

# Restrict the working frame to the retained columns
fifa_df_filtered = fifa_df[columns_to_retain]
fifa_df_filtered

In [None]:
# Boolean mask: True for every row that exactly repeats an earlier row
check_duplicates = fifa_df_filtered.duplicated(keep='first')
check_duplicates

In [None]:
# Number of duplicated rows (each True in the mask counts as 1).
# NOTE(review): the count is only inspected; no later top-level cell drops these rows.
num_duplicates = check_duplicates.sum()
num_duplicates

In [None]:
# Replace the two abbreviated headers with descriptive names
column_rename = dict(BP='Best Position', OVA='overall value')
fifa_df_filtered = fifa_df_filtered.rename(columns=column_rename)
fifa_df_filtered

In [None]:
# Normalise the headers to snake_case: lower-case, spaces replaced by underscores
cols = [name.lower().replace(' ', '_') for name in fifa_df_filtered.columns]
fifa_df_filtered.columns = cols
fifa_df_filtered

In [None]:
#def convert_position_rating(value):
    # if it is a full number it remains as it is
    #if isinstance(value, int):
        #return value
    
    # If the value contains '+', we split it and add the two parts
    #if '+' in value:
        #base, added = value.split('+')
        #return int(base) + int(added)
    # If there is no '+', we just convert the value to full number
    #else:
        #return int(value)

# Columns to apply it
#columns_to_process = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam',
       #'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm',
       #'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb']

# Applying the function to each column
#for col in columns_to_process:
    #fifa_df_filtered[col] = fifa_df_filtered[col].apply(convert_position_rating)
#fifa_df_filtered

In [None]:
# Count the missing values in each column
fifa_df_filtered.isna().sum()

In [None]:
# Split the frame by dtype: text (object) columns vs. numeric columns.
# NOTE(review): this runs before the null-imputation cell, so these two frames
# presumably still hold any missing values - confirm before relying on them.
cat_df = fifa_df_filtered.select_dtypes(include=['object'])
num_df = fifa_df_filtered.select_dtypes(include=[np.number])


In [None]:
# Impute missing values: median for numeric columns, mode for every other column.
# The filled Series is assigned back to the frame instead of calling
# `fifa_df_filtered[col].fillna(..., inplace=True)`: that chained in-place form
# only updates a temporary Series under pandas Copy-on-Write, leaving the frame's
# nulls untouched (and it raises a FutureWarning on recent pandas 2.x).
for col in fifa_df_filtered.columns:
    column_values = fifa_df_filtered[col]
    if pd.api.types.is_numeric_dtype(column_values):
        fill_value = column_values.median()
    else:
        modes = column_values.mode()
        if modes.empty:
            # Entirely-null column: there is no mode to fill with, leave it as is.
            continue
        fill_value = modes.iloc[0]
    fifa_df_filtered[col] = column_values.fillna(fill_value)

In [None]:
# Verify the imputation: per-column null counts, then their grand total
null_counts = fifa_df_filtered.isnull().sum()
null_counts.sum()


In [None]:
# Summary statistics, transposed so that each column becomes a row
fifa_df_filtered.describe().T

In [None]:
# Plotting the numerical columns
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# One histogram with a KDE overlay per numeric column, each shown as its own figure
for column in num_df.columns:
    sns.displot(num_df[column], kde=True)
    plt.show()

In [None]:
# Overall value shows an approximately normal (bell-shaped) distribution

In [None]:
# Pairwise correlation matrix of the numeric columns
correlations = num_df.corr()
correlations

In [None]:
# Annotated heatmap of the numeric correlation matrix, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 15))
ax = sns.heatmap(num_df.corr(), annot=True, ax=ax)
plt.show()

In [None]:
# There seems to be some multicollinearity between the features

In [None]:
# Plain assignment: `fifa_new` is a second name for the same object as
# `fifa_df_filtered`, not a copy.
fifa_new = fifa_df_filtered
fifa_new

In [None]:
# Target is the overall rating; the features are every other column
y = fifa_new['overall_value']
X = fifa_new.drop(columns='overall_value')


In [None]:
# Separate the features by dtype so each group can get its own transformation
X_cat = X.select_dtypes(include=['object'])
X_num = X.select_dtypes(include=[np.number])

In [None]:
# Inspect the numeric features
X_num

In [None]:
# Summary statistics of the numeric features, one row per feature
X_num.describe().T

In [None]:
# importing the library
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Standardise the numeric features with StandardScaler
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down in the notebook.
transformer = StandardScaler()
transformer.fit(X_num)
X_standard = transformer.transform(X_num)
print(X_standard.shape)
X_num_standard = pd.DataFrame(data=X_standard, columns=X_num.columns)
X_num_standard.head()

In [None]:
# One hot encoded
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fit a one-hot encoder on the categorical features; the first level of each is dropped
encoder = OneHotEncoder(drop='first')
encoder.fit(X_cat)

# Names of the generated indicator columns
cols = encoder.get_feature_names_out(input_features=X_cat.columns)

# The transform output is converted to a dense array before building the frame
X_cat_encode = pd.DataFrame(data=encoder.transform(X_cat).toarray(), columns=cols)

X_cat_encode.head()

In [None]:
# Join the scaled numeric block and the one-hot block column-wise into the feature matrix
X = pd.concat([X_num_standard, X_cat_encode], axis='columns')
X.describe()

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split as tts  # alias, unused by the visible cells

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Shapes of the four splits, printed in the order X_train, X_test, y_train, y_test
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

In [None]:
# Preview the first rows of the training features
X_train.head()

In [None]:
# Fit a linear regression on the training rows, then predict the held-out rows
from sklearn.linear_model import LinearRegression as linreg

lm = linreg().fit(X_train, y_train)   # instantiate and train in one step
y_pred = lm.predict(X_test)           # predictions for the test features

In [None]:
# Model Validation

In [None]:
# R2 on the training rows vs. the held-out rows
train_r2 = lm.score(X_train, y_train)
test_r2 = lm.score(X_test, y_test)
print ('train R2: {} -- test R2: {}'.format(train_r2, test_r2))

In [None]:
# MSE (mean squared error) on the training and the test rows
from sklearn.metrics import mean_squared_error as mse

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric, so
# the numbers are unchanged, but the documented order keeps the cell correct if
# the metric is later swapped for an asymmetric one.
train_mse = mse(y_train, lm.predict(X_train))
test_mse = mse(y_test, y_pred)

print ('train MSE: {} -- test MSE: {}'.format(train_mse, test_mse))

In [None]:
# RMSE: square root of `train_mse` / `test_mse`
train_rmse = train_mse ** 0.5
test_rmse = test_mse ** 0.5
print ('train RMSE: {} -- test RMSE: {}'.format(train_rmse, test_rmse))

In [None]:
# MAE (mean absolute error) on the training and the test rows
from sklearn.metrics import mean_absolute_error as mae

# scikit-learn metrics take (y_true, y_pred) in that order. MAE is symmetric, so
# the numbers are unchanged, but the documented order avoids latent misuse.
train_mae = mae(y_train, lm.predict(X_train))
test_mae = mae(y_test, y_pred)

print ('train MAE: {} -- test MAE: {}'.format(train_mae, test_mae))

# BONUS

Try to create a new function called `preprocess` which:

- takes a dataframe as input,
- calls all the other group members' functions and applies them to the starting dataframe,
- returns a clean dataframe.

In [None]:
    # Your code here
    def clean_fifa(fifa_new):
        """Return a cleaned copy of a FIFA 21 dataframe, ready for modelling.

        Steps, in order:
          1. rename 'BP' -> 'Best Position' and 'OVA' -> 'overall value';
          2. normalise every column name (lower-case, spaces -> underscores);
          3. keep only the modelling columns;
          4. drop duplicated rows and renumber the index from 0;
          5. fill nulls with the median (numeric columns) or the mode (others).

        Names are normalised before the columns are selected, so the function
        accepts the raw extract as well as a frame it has already cleaned.

        Parameters
        ----------
        fifa_new : pandas.DataFrame
            Input frame. It is not modified.

        Returns
        -------
        pandas.DataFrame
            A new frame holding only the cleaned modelling columns.

        Raises
        ------
        KeyError
            If one of the modelling columns is missing from the input.
        """
        keep = [
            'best_position', 'position',
            'attacking', 'skill', 'movement', 'power', 'mentality', 'composure',
            'defending', 'goalkeeping', 'overall_value',
        ]

        # rename() returns a new frame, so relabelling it leaves the caller's frame alone.
        renamed = fifa_new.rename(columns={'BP': 'Best Position', 'OVA': 'overall value'})
        renamed.columns = [str(name).lower().replace(' ', '_') for name in renamed.columns]

        # Deduplicate on the retained columns only, then renumber the rows.
        cleaned = renamed[keep].drop_duplicates().reset_index(drop=True)

        # Collect one fill value per column; columns without a usable value are skipped.
        fill_values = {}
        for col in cleaned.columns:
            series = cleaned[col]
            if pd.api.types.is_numeric_dtype(series):
                median = series.median()
                if pd.notna(median):
                    fill_values[col] = median
            else:
                modes = series.mode()
                if not modes.empty:
                    fill_values[col] = modes.iloc[0]

        if fill_values:
            cleaned = cleaned.fillna(value=fill_values)
        return cleaned
    fifa_new          