# Importing All the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
# Load the train dataset (df) and the per-player test dataset (df_2) from the
# relative ../input/kick-off directory.
df = pd.read_csv("../input/kick-off/FIFA_train.csv")
df_2 = pd.read_csv('../input/kick-off/FIFA_test_player_data.csv')

# Exploring the dataset

In [2]:
# Preview the first rows of the train dataset
df.head()

In [3]:
# Shape of the train dataset as (rows, columns)
df.shape

In [4]:
# Drop the train columns with more than 80% null values, i.e. keep a column only
# if at least 20% of its rows are non-null.
# `how` is intentionally not passed: recent pandas versions raise a TypeError when
# `how` and `thresh` are given together, and `thresh` alone expresses the rule.
min_non_null = int(np.ceil(df.shape[0] * 0.2))
df = df.dropna(thresh=min_non_null, axis=1)

In [5]:
# Drop the test columns with more than 80% null values, i.e. keep a column only
# if at least 20% of its rows are non-null.
# `how` is intentionally not passed: recent pandas versions raise a TypeError when
# `how` and `thresh` are given together, and `thresh` alone expresses the rule.
min_non_null = int(np.ceil(df_2.shape[0] * 0.2))
df_2 = df_2.dropna(thresh=min_non_null, axis=1)

In [6]:
# Remove the columns this notebook treats as unnecessary (train dataset)
unused_columns = [
    'player_url', 'short_name', 'long_name', 'dob', 'work_rate',
    'real_face', 'joined', 'contract_valid_until', 'team_jersey_number',
    'player_traits',
]
df = df.drop(columns=unused_columns)

In [7]:
# Remove the columns this notebook treats as unnecessary (test dataset)
unused_columns = [
    'player_url', 'short_name', 'long_name', 'dob', 'work_rate',
    'real_face', 'joined', 'contract_valid_until', 'team_jersey_number',
    'player_traits',
]
df_2 = df_2.drop(columns=unused_columns)

In [8]:
# Number of missing values in each column of the train dataset
df.isnull().sum()

In [9]:
# Checking the distribution of the column release_clause_eur.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# its documented replacement and matches the other distribution plots here.
plt.figure(figsize=(20,10))
sns.histplot(df.release_clause_eur, kde=True, stat="density")

As the distribution of the release_clause_eur column is highly left skewed, we will use a machine learning model at a later stage to predict and impute its null values; that way the imputation will be more accurate.

In [11]:
# Histogram with KDE overlay for six attribute columns, laid out on a 2x3 grid
fig, axs = plt.subplots(2, 3, figsize=(10, 7))

# (column, colour) pairs in row-major grid order
attribute_colors = [
    ("pace", "skyblue"),
    ("shooting", "olive"),
    ("dribbling", "gold"),
    ("defending", "teal"),
    ("physic", "teal"),
    ("passing", "green"),
]
for ax, (attribute, color) in zip(axs.flat, attribute_colors):
    sns.histplot(data=df, x=attribute, kde=True, color=color, ax=ax)

In [12]:
#fill_columns = ['pace','shooting','dribbling','physic','passing']
#df[fill_columns] = df[fill_columns].fillna(df[fill_columns].mean())

In [13]:
# Re-check the number of missing values per column of the train dataset
df.isna().sum()

# Missing Value Imputation

In [14]:
# Fill the missing values of the listed columns with random integers within
# 2 standard deviations of the column mean (train dataset).
fill_columns = ['pace','shooting','dribbling','physic','passing','defending']
# Seeded generator so that Restart & Run All reproduces the same imputed values.
rng = np.random.default_rng(42)
for columns in fill_columns:
    missing = df[columns].isnull()
    if not missing.any():
        # Nothing to impute; also avoids drawing from a possibly empty range.
        continue
    mean = df[columns].mean()
    std = df[columns].std()

    # Explicit integer bounds instead of relying on implicit float-to-int
    # conversion; the upper bound is exclusive, as with np.random.randint.
    low = int(mean - (2 * std))
    high = int(mean + (2 * std))
    rand_val = rng.integers(low, high, size=int(missing.sum()))

    # fill missing values of the column with the generated random values
    df.loc[missing, columns] = rand_val

In [15]:
# Fill the missing values of the listed columns with random integers within
# 2 standard deviations of the column mean (test dataset).
fill_columns = ['pace','shooting','dribbling','physic','passing','defending']
# Seeded generator so that Restart & Run All reproduces the same imputed values.
rng = np.random.default_rng(42)
for columns in fill_columns:
    missing = df_2[columns].isnull()
    if not missing.any():
        # Nothing to impute; also avoids drawing from a possibly empty range.
        continue
    mean = df_2[columns].mean()
    std = df_2[columns].std()

    # Explicit integer bounds instead of relying on implicit float-to-int
    # conversion; the upper bound is exclusive, as with np.random.randint.
    low = int(mean - (2 * std))
    high = int(mean + (2 * std))
    rand_val = rng.integers(low, high, size=int(missing.sum()))

    # fill missing values of the column with the generated random values
    df_2.loc[missing, columns] = rand_val

In [16]:
# Preview the train dataset after the random imputation
df.head()

In [17]:
# Number of missing values left in the pace column of the train dataset
df['pace'].isnull().sum()

In [18]:
# Fill the missing values in the team_position column with the mode (train dataset).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated under pandas
# copy-on-write because it may modify a temporary copy.
df['team_position'] = df['team_position'].fillna(df['team_position'].mode()[0])

In [19]:
# Fill the missing values in the team_position column with the mode (test dataset).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated under pandas
# copy-on-write because it may modify a temporary copy.
df_2['team_position'] = df_2['team_position'].fillna(df_2['team_position'].mode()[0])

In [20]:
# Keep only the text before the first "+" in every non-null value of the
# player position columns (train dataset)
pos_list = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam',
    'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm',
    'rdm', 'rwb', 'lb', 'lcb', 'cb',
    'rcb', 'rb',
]
for column in pos_list:
    has_value = df[column].notna()
    df.loc[has_value, column] = [
        str(raw).split("+", 1)[0] for raw in df.loc[has_value, column]
    ]
int_list = []

In [21]:
# Keep only the text before the first "+" in every non-null value of the
# player position columns (test dataset)
pos_list = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam',
    'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm',
    'rdm', 'rwb', 'lb', 'lcb', 'cb',
    'rcb', 'rb',
]
for column in pos_list:
    has_value = df_2[column].notna()
    df_2.loc[has_value, column] = [
        str(raw).split("+", 1)[0] for raw in df_2.loc[has_value, column]
    ]
int_list = []

In [23]:
# Convert the player position columns to a numeric dtype (train dataset).
# pd.to_numeric converts the whole column at once, so the column gets a real
# numeric dtype (float64 when NaN is present). Assigning converted values
# element-wise left an object-dtype column, which the later
# select_dtypes("object") encoding step would treat as categorical. It also
# matches the float conversion applied to the test dataset.
for column in pos_list:
    df[column] = pd.to_numeric(df[column])

In [24]:
# Convert the player position columns to a numeric dtype (test dataset).
# pd.to_numeric converts the whole column at once, so the column gets a real
# numeric dtype (float64 when NaN is present). Assigning converted values
# element-wise left an object-dtype column, which the later
# select_dtypes("object") encoding step would treat as categorical.
for column in pos_list:
    df_2[column] = pd.to_numeric(df_2[column])

In [25]:
# Plot the distribution of every player position column on one 7x4 grid.
# The figure is created once, outside the loop: calling plt.figure() on every
# iteration started a new 24x24 figure per column, each with a single subplot.
fig, axes = plt.subplots(7, 4, figsize=(24, 24))
for ax, column in zip(axes.flat, pos_list):
    sns.histplot(df[column], kde=True, ax=ax)
# Hide the grid cells that have no position column to show.
for ax in axes.flat[len(pos_list):]:
    ax.set_visible(False)

In [26]:
# Columns in `fill` are imputed with random values in a later cell; the columns
# remaining in pos_list are imputed with their mean.
fill = ['rcb','cb','lcb']
# Rebuild the list instead of calling pos_list.remove(): re-running this cell
# no longer raises ValueError once the columns have already been removed.
pos_list = [column for column in pos_list if column not in fill]

In [27]:
# Impute the missing values of the columns in pos_list with the column mean (train dataset).
# Assigning the result back replaces the chained fillna(inplace=True) call,
# which is deprecated under pandas copy-on-write.
for column in pos_list:
    df[column] = df[column].fillna(df[column].mean())

In [28]:
# Impute the missing values of the columns in pos_list with the column mean (test dataset).
# Assigning the result back replaces the chained fillna(inplace=True) call,
# which is deprecated under pandas copy-on-write.
for column in pos_list:
    df_2[column] = df_2[column].fillna(df_2[column].mean())

In [29]:
# Impute the missing values of the columns in `fill` with random integers
# within 2 standard deviations of the column mean (train dataset).
# Seeded generator so that Restart & Run All reproduces the same imputed values.
rng = np.random.default_rng(42)
for columns in fill:
    missing = df[columns].isnull()
    if not missing.any():
        # Nothing to impute; also avoids drawing from a possibly empty range.
        continue
    mean = df[columns].mean()
    std = df[columns].std()

    # Explicit integer bounds instead of relying on implicit float-to-int
    # conversion; the upper bound is exclusive, as with np.random.randint.
    low = int(mean - (2 * std))
    high = int(mean + (2 * std))
    rand_val = rng.integers(low, high, size=int(missing.sum()))

    # fill missing values of the column with the generated random values
    df.loc[missing, columns] = rand_val

In [30]:
# Impute the missing values of the columns in `fill` with random integers
# within 2 standard deviations of the column mean (test dataset).
# Seeded generator so that Restart & Run All reproduces the same imputed values.
rng = np.random.default_rng(42)
for columns in fill:
    missing = df_2[columns].isnull()
    if not missing.any():
        # Nothing to impute; also avoids drawing from a possibly empty range.
        continue
    mean = df_2[columns].mean()
    std = df_2[columns].std()

    # Explicit integer bounds instead of relying on implicit float-to-int
    # conversion; the upper bound is exclusive, as with np.random.randint.
    low = int(mean - (2 * std))
    high = int(mean + (2 * std))
    rand_val = rng.integers(low, high, size=int(missing.sum()))

    # fill missing values of the column with the generated random values
    df_2.loc[missing, columns] = rand_val

# Encoding the categorical columns

In [31]:
# Ordinal-encode the object-dtype columns of the train dataset
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Every object-dtype column except release_clause_eur is encoded
cat_columns_train = [
    name
    for name in df.select_dtypes("object").columns
    if name != "release_clause_eur"
]
# One OrdinalEncoder step applied to all selected columns
transformer = ColumnTransformer(
    transformers=[("L", OrdinalEncoder(), cat_columns_train)],
    remainder='passthrough',
)
cat_columns_imputed_train = transformer.fit_transform(df[cat_columns_train])
# Write the encoded values back over the original columns of the train dataset
df[cat_columns_train] = cat_columns_imputed_train

In [32]:
# Raise the display limit to 150 rows (presumably so the per-column counts
# below are shown in full)
pd.set_option('display.max_rows', 150)
# Number of missing values in each column of the test dataset
df_2.isna().sum()

In [33]:
# Ordinal-encode the object-dtype columns of the test dataset with the SAME
# category-to-integer mapping that was learned on the train dataset.
# Fitting a second, independent encoder on the test data (as before) gives the
# same category different integer codes in train and test, so a model trained
# on the train codes would misread the test codes.
from sklearn.preprocessing import OrdinalEncoder

# Every object-dtype column except release_clause_eur is encoded
cat_columns_test = [
    name
    for name in df_2.select_dtypes("object").columns
    if name != "release_clause_eur"
]

# Categories learned by the train-side encoder (`transformer` and
# `cat_columns_train` come from the train encoding cell and are not reassigned
# here, so this cell can be re-run safely).
train_categories = dict(
    zip(cat_columns_train, transformer.named_transformers_["L"].categories_)
)
shared_columns = [name for name in cat_columns_test if name in train_categories]
test_only_columns = [name for name in cat_columns_test if name not in train_categories]

if shared_columns:
    # Categories that never occurred in the train data are encoded as -1
    # instead of raising an error.
    test_encoder = OrdinalEncoder(
        categories=[train_categories[name] for name in shared_columns],
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    df_2[shared_columns] = test_encoder.fit_transform(df_2[shared_columns])

if test_only_columns:
    # No train mapping exists for these columns, so they keep the previous
    # behaviour: an encoder fitted on the test data itself.
    df_2[test_only_columns] = OrdinalEncoder().fit_transform(df_2[test_only_columns])

In [34]:
# Data type of each column of the train dataset after encoding
df.dtypes

In [35]:
#KNN imputing the FireplaceQu column of train dataset
#from fancyimpute import KNN
##KNN imputing the FireplaceQu column of train dataset
#imputer = KNN(k=5)
#release_clause_eur_imputed = imputer.fit_transform(np.array(df["release_clause_eur"]).reshape(-1,1))
#df["release_clause_eur"] = release_clause_eur_imputed

In [36]:
#df.drop("release_clause_eur", axis = 1, inplace = True)

In [37]:
#df.drop("value", axis = 1, inplace = True)

# Imputing the release_clause_eur column values by predicting them with the help of Random forest regressor model

In [42]:
# Number of missing values in the release_clause_eur column (train dataset)
df['release_clause_eur'].isnull().sum()

In [43]:
# Train rows where release_clause_eur is known: training data for the train-side imputation model
df_3 = df.loc[df['release_clause_eur'].notna()]

In [44]:
# Test rows where release_clause_eur is known: training data for the test-side imputation model
df_4 = df_2.loc[df_2['release_clause_eur'].notna()]

In [45]:
# Features / target for the train-side imputation model.
# NOTE(review): X keeps every other column of df, including "value", which is
# the target of the final model later in this notebook - confirm this is intended.
X = df_3.drop('release_clause_eur', axis = 1)
y = df_3['release_clause_eur']

In [46]:
# Features / target for the test-side imputation model
X_2 = df_4.drop('release_clause_eur', axis = 1)
y_2 = df_4['release_clause_eur']

In [47]:
# Train rows with a missing release_clause_eur: the rows the train-side model will predict
X_pred = df.loc[df['release_clause_eur'].isnull()].drop('release_clause_eur', axis = 1)

In [48]:
# Test rows with a missing release_clause_eur: the rows the test-side model will predict
X_pred_2 = df_2.loc[df_2['release_clause_eur'].isnull()].drop('release_clause_eur', axis = 1)

In [49]:
# Imputation model for the train dataset.
# random_state is fixed so the imputed release_clause_eur values are the same
# on every Restart & Run All.
model = RandomForestRegressor(random_state=0)

In [50]:
# Imputation model for the test dataset.
# random_state is fixed so the imputed release_clause_eur values are the same
# on every Restart & Run All.
model_2 = RandomForestRegressor(random_state=0)

In [51]:
# Fit the train-side imputation model on the train rows with a known release_clause_eur
model.fit(X,y)

In [52]:
# Fit the test-side imputation model on the test rows with a known release_clause_eur
model_2.fit(X_2,y_2)

In [53]:
# Predict release_clause_eur for the train rows where it is missing
y_pred = model.predict(X_pred)

In [54]:
# Predict release_clause_eur for the test rows where it is missing
y_pred_2 = model_2.predict(X_pred_2)

In [55]:
#Imputing the missing values in release_clause_eur for train dataset
# (the same isnull() mask selected the rows of X_pred, so the predictions line up row by row)
df.loc[df['release_clause_eur'].isnull() , 'release_clause_eur'] = y_pred

In [56]:
#Imputing the missing values in release_clause_eur for test dataset
# (the same isnull() mask selected the rows of X_pred_2, so the predictions line up row by row)
df_2.loc[df_2['release_clause_eur'].isnull() , 'release_clause_eur'] = y_pred_2

# Feature Selection

In [57]:
# Features (every column except "value") and target ("value") for the final model
X = df.drop("value", axis = 1)
y = df["value"]

In [58]:
# Split the dataset into train and hold-out parts.
# NOTE(review): in this notebook X_train / X_test / y_train / y_test are only
# referenced by commented-out cells; the models below are fitted on the full X, y.
X_train, X_test, y_train , y_test = train_test_split(X,y)

In [59]:
#params = {'max_depth':[5,10,20,50,80],'eta' : [0.1,0.05,0.3,0.03]}
#random_search_cv = RandomizedSearchCV(XGBRegressor(), params, verbose=2, cv=3)

In [60]:
#random_search_cv.fit(X_train, y_train)

In [61]:
#random_search_cv.best_estimator_

In [62]:
# Fit a RandomForestRegressor on the full dataset to obtain feature importances.
# random_state is fixed so the importances, and therefore the selected
# features, are the same on every Restart & Run All.
rf = RandomForestRegressor(n_estimators=150, random_state=0)
rf.fit(X, y)

In [63]:
# Feature importances of the fitted RandomForestRegressor, displayed in ascending order
sort = rf.feature_importances_.argsort()
rf.feature_importances_[sort]

In [64]:
# Print each feature name with its importance multiplied by 10000
# (in the original column order of X, not sorted)
print("Feature Importances :")
for feature in zip(X.columns, rf.feature_importances_*10000):
    print(feature)

In [65]:
# Keep the features whose importance reaches the threshold of 1/10000
sfm = SelectFromModel(rf,threshold = 1/10000)

In [66]:
# Fit the selector on the full dataset
sfm.fit(X, y)

In [67]:
# Print the names of the features kept by the selector and store them in
# imp_features (the selection is made by the importance threshold, so the
# number of names is not necessarily 20)
print("Top 20 features :")
imp_features = list(X.columns[sfm.get_support(indices=True)])
for feature_name in imp_features:
    print(feature_name)

In [68]:
# Reduce X to the columns kept by the selector
X_important_train = sfm.transform(X)

# Building and fitting the final model

In [69]:
# Final XGBoost regressor.
# Only the settings that matter are passed. The previous call was a pasted
# estimator repr that set the learning rate twice (eta and learning_rate) and
# included version-specific arguments such as gpu_id and predictor, which are
# deprecated in newer xgboost releases. Every argument dropped here was set to
# its documented default value.
model = XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    learning_rate=0.1,
    max_depth=80,
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
    tree_method='exact',
)

In [70]:
# Fit the final model on the selected features of the full train dataset
model.fit(X_important_train,y)

In [71]:
# Predict "value" for the test players, using the same selected feature columns by name
y_pred = model.predict(df_2[imp_features])

In [72]:
#mae_1 = mean_absolute_error(y_test,y_pred)
#rmse_1 = np.sqrt(mean_squared_error(y_test,y_pred))
#r_squared_1 = r2_score(y_test,y_pred)
#print("Root mean square error : ",rmse_1)
#print("Mean absolute error : ",mae_1)
#print("r squared : ",r_squared_1)

In [73]:
# Shape of the test dataset before the predictions are attached
df_2.shape

In [74]:
# Store the predicted value of every test player in a new "value" column
df_2["value"] = y_pred

In [75]:
# Inspect the predicted values
df_2.value

# Saving the result in required format

In [76]:
# Load the test file whose player_ids column holds the id lists to be evaluated
final_df = pd.read_csv('../input/kick-off/FIFA_test.csv')

In [77]:
# Number of non-null entries per column
final_df.count()

In [78]:
def _split_id_list(raw):
    """Split one player_ids string on commas and strip brackets and spaces."""
    parts = raw.split(',')
    # the opening bracket is removed from the first piece, the closing one from the last
    parts[0] = parts[0].replace("[","")
    parts[-1] = parts[-1].replace("]","")
    return [piece.replace(" ","") for piece in parts]


# One list of id strings per row of final_df
cleaned_ids = [_split_id_list(raw) for raw in final_df['player_ids']]

In [79]:
# One row per sofifa_id, holding the maximum predicted value for that id
df_5 = df_2.groupby('sofifa_id', as_index=False)['value'].max()

In [80]:
# (sofifa_id, value) pairs in the row order of df_5
zipped = list(zip(df_5.sofifa_id,df_5.value))

In [81]:
# Spot check: print the (sofifa_id, value) pair of player 238577, if present
for player_id, player_value in zipped:
    if player_id == 238577:
        print((player_id, player_value))

In [82]:
# Convert every cleaned id string to an integer, keeping the row structure
numeric_ids = [[int(token) for token in row] for row in cleaned_ids]

In [83]:
#ids = df_2["sofifa_id"]
#df_2[ids.isin(ids[ids.duplicated()])]

In [84]:
# For every row of numeric_ids, collect the predicted value of each listed
# player, in the same order as the ids.
# A dict lookup replaces the scan over all of df_5 for every id. Every id in
# the row is handled (not only the first 11), and an id that has no predicted
# value gets -inf instead of being skipped, so positions in `container` stay
# aligned with positions in numeric_ids and such an id can never win the max.
value_by_id = dict(zipped)
container = []
for row_ids in numeric_ids:
    container.append(
        [value_by_id.get(player_id, float("-inf")) for player_id in row_ids]
    )
a = []

In [85]:
# For every row, take the id at the position of the highest collected value
final_outcome = []
for row_number, row_ids in enumerate(numeric_ids):
    row_values = container[row_number]
    best_position = row_values.index(max(row_values))
    final_outcome.append(row_ids[best_position])

In [86]:
# Number of rows in the submission frame (one entry of final_outcome is expected per row)
final_df.player_ids.shape

# Saving Final Dataset

In [87]:
# Attach the selected player id of every row
final_df["most_valued_player_id"] = final_outcome

In [88]:
# Write the submission file without the index column
final_df.to_csv("submit_3.csv", index = False)