In [83]:
#importing relevant libraries 
import pandas as pd
import numpy as np 
import pickle as pkl
import category_encoders as ce

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#importing models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

#evaluation metrics 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#Fine tuning the model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score

# Importing streamlit
import streamlit as st



### Data Preprocessing 


In [4]:
#Reading the data
# low_memory=False makes pandas parse the file in one pass, which avoids
# mixed-dtype columns caused by chunk-by-chunk type inference.
fifaDF = pd.read_csv('male_players (legacy).csv', low_memory = False)

In [6]:
# Preview the first five rows to sanity-check the load
fifaDF.head()

Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,158023,/player/158023/lionel-messi/150002,15,2,2014-09-18,L. Messi,Lionel Andrés Messi Cuccittini,CF,93,95,...,62+3,62+3,62+3,54+3,45+3,45+3,45+3,54+3,15+3,https://cdn.sofifa.net/players/158/023/15_120.png
1,20801,/player/20801/c-ronaldo-dos-santos-aveiro/150002,15,2,2014-09-18,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"LW, LM",92,92,...,63+3,63+3,63+3,57+3,52+3,52+3,52+3,57+3,16+3,https://cdn.sofifa.net/players/020/801/15_120.png
2,9014,/player/9014/arjen-robben/150002,15,2,2014-09-18,A. Robben,Arjen Robben,"RM, LM, RW",90,90,...,64+3,64+3,64+3,55+3,46+3,46+3,46+3,55+3,14+3,https://cdn.sofifa.net/players/009/014/15_120.png
3,41236,/player/41236/zlatan-ibrahimovic/150002,15,2,2014-09-18,Z. Ibrahimović,Zlatan Ibrahimović,ST,90,90,...,65+3,65+3,61+3,56+3,55+3,55+3,55+3,56+3,17+3,https://cdn.sofifa.net/players/041/236/15_120.png
4,167495,/player/167495/manuel-neuer/150002,15,2,2014-09-18,M. Neuer,Manuel Peter Neuer,GK,90,90,...,40+3,40+3,36+3,36+3,38+3,38+3,38+3,36+3,87+3,https://cdn.sofifa.net/players/167/495/15_120.png


In [74]:
#Selecting the columns that have less than 30% null values
# NOTE: despite its name, `greater_than` holds the columns that are KEPT
# (null count below the threshold); `less_than` holds the rejected ones.
null_counts = fifaDF.isnull().sum()
keep_mask = null_counts < (0.3 * (fifaDF.shape[0]))
greater_than = list(null_counts.index[keep_mask])
less_than = list(null_counts.index[~keep_mask])

In [75]:
#Reassigning the data frame to the new dataframe 
# Keep only the columns that passed the null-ratio filter (stored in `greater_than`)
fifaDF = fifaDF[greater_than]
# Summarise the remaining columns, dtypes and memory usage
fifaDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Columns: 102 entries, player_id to player_face_url
dtypes: float64(14), int64(45), object(43)
memory usage: 125.7+ MB


In [169]:
#separating the numeric and non numeric features 
numeric_data = fifaDF.select_dtypes(include = np.number)  # numeric-dtype columns
non_numeric = fifaDF.select_dtypes(include = ['object'])  # object-dtype columns

##### Dealing with numeric data

In [170]:
#correlation between features listed and overall.
# Pairwise correlation between every pair of numeric columns
corr_matrix = numeric_data.corr()

In [171]:
# Keep only the 'overall' column, sorted from most positive to most negative;
# from here on `corr_matrix` is a Series rather than a matrix.
corr_matrix = corr_matrix["overall"].sort_values(ascending=False)

In [172]:
# Identify features whose absolute correlation with the target is greater than 0.4
high_corr_features = corr_matrix[abs(corr_matrix) > 0.4 ].index

In [173]:
# Features whose absolute correlation with 'overall' is below 0.4
low_corr_features = corr_matrix[abs(corr_matrix) < 0.4 ]
print(f'The low correlation features are:\n{low_corr_features}')

The columns to drop are:
mentality_aggression              0.398310
attacking_crossing                0.396833
skill_fk_accuracy                 0.387269
attacking_volleys                 0.377048
skill_dribbling                   0.369827
power_stamina                     0.360025
power_strength                    0.350786
mentality_positioning             0.350178
attacking_heading_accuracy        0.340029
mentality_penalties               0.334488
skill_moves                       0.330951
attacking_finishing               0.326376
defending                         0.325151
mentality_interceptions           0.314282
power_jumping                     0.275578
defending_marking_awareness       0.274173
movement_agility                  0.263671
defending_standing_tackle         0.250624
defending_sliding_tackle          0.220398
weak_foot                         0.217422
movement_sprint_speed             0.208990
movement_acceleration             0.194625
pace                         

In [174]:
# Hard-coded list of numeric columns to remove from the feature set
columns_to_drop = ['player_id','nationality_id','league_id','club_team_id','club_jersey_number','fifa_version','weight_kg','movement_balance','club_contract_valid_until_year','height_cm','fifa_update' ]

In [175]:
#dropping the irrelevant columns from the numeric dataFrame
# Rebind instead of using inplace=True: `numeric_data` was derived from
# `fifaDF` via select_dtypes, and a non-mutating drop avoids hidden in-place
# state. `columns=` already selects the column axis, so axis=1 is omitted.
numeric_data = numeric_data.drop(columns = columns_to_drop)

In [176]:
# Check the remaining numeric columns and their non-null counts before imputing
numeric_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Data columns (total 48 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   overall                      161583 non-null  int64  
 1   potential                    161583 non-null  int64  
 2   value_eur                    159530 non-null  float64
 3   wage_eur                     159822 non-null  float64
 4   age                          161583 non-null  int64  
 5   league_level                 158866 non-null  float64
 6   weak_foot                    161583 non-null  int64  
 7   skill_moves                  161583 non-null  int64  
 8   international_reputation     161583 non-null  int64  
 9   pace                         143614 non-null  float64
 10  shooting                     143614 non-null  float64
 11  passing                      143614 non-null  float64
 12  dribbling                    143614 non-null  float64
 13 

In [178]:
#multivariate imputation 
imp = IterativeImputer(max_iter = 10, random_state = 0)
# fit_transform learns from the data and fills in the missing values; every
# value is then rounded to a whole number.
imputed_values = np.round(imp.fit_transform(numeric_data))
# NOTE(review): 'overall' (the target) is still a column of `numeric_data`
# here, so it takes part in imputing the features — confirm this is intended.
numeric_data = pd.DataFrame(imputed_values, columns = numeric_data.columns)

In [179]:
# Display the imputed numeric frame
numeric_data

Unnamed: 0,overall,potential,value_eur,wage_eur,age,league_level,weak_foot,skill_moves,international_reputation,pace,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,93.0,95.0,100500000.0,550000.0,27.0,1.0,3.0,4.0,5.0,93.0,...,76.0,92.0,25.0,21.0,20.0,6.0,11.0,15.0,14.0,8.0
1,92.0,92.0,79000000.0,375000.0,29.0,1.0,4.0,5.0,5.0,93.0,...,85.0,89.0,22.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,90.0,90.0,54500000.0,275000.0,30.0,1.0,2.0,4.0,5.0,93.0,...,80.0,90.0,29.0,26.0,26.0,10.0,8.0,11.0,5.0,15.0
3,90.0,90.0,52500000.0,275000.0,32.0,1.0,4.0,4.0,5.0,76.0,...,91.0,90.0,25.0,41.0,27.0,13.0,15.0,10.0,9.0,12.0
4,90.0,90.0,63500000.0,300000.0,28.0,1.0,4.0,1.0,5.0,60.0,...,37.0,59.0,25.0,25.0,25.0,87.0,85.0,92.0,90.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161578,46.0,61.0,110000.0,700.0,18.0,1.0,3.0,2.0,1.0,63.0,...,54.0,40.0,23.0,21.0,25.0,9.0,13.0,13.0,12.0,7.0
161579,46.0,58.0,110000.0,750.0,19.0,1.0,3.0,2.0,1.0,55.0,...,31.0,35.0,50.0,51.0,45.0,6.0,14.0,8.0,13.0,14.0
161580,46.0,58.0,110000.0,500.0,19.0,1.0,2.0,2.0,1.0,65.0,...,37.0,35.0,36.0,45.0,50.0,8.0,9.0,7.0,14.0,9.0
161581,46.0,70.0,150000.0,500.0,17.0,1.0,3.0,2.0,1.0,55.0,...,63.0,43.0,19.0,17.0,14.0,13.0,12.0,14.0,7.0,13.0


##### Dealing with Non-numeric features 

In [200]:
# Re-select the object-dtype columns from fifaDF
non_numeric = fifaDF.select_dtypes(include = ['object'])
# Hard-coded list of object columns to remove (rebinds the name used earlier
# for the numeric drop list)
columns_to_drop = ['player_url','fifa_update_date','player_face_url','dob','short_name', 'long_name','league_name','club_name','club_position','club_joined_date','real_face','body_type']

In [201]:
#Dropping the irrelevant columns 
# Rebind instead of using inplace=True: `non_numeric` was derived from
# `fifaDF` via select_dtypes, and a non-mutating drop avoids hidden in-place
# state. `columns=` already selects the column axis, so axis=1 is omitted.
non_numeric = non_numeric.drop(columns = columns_to_drop)

In [202]:
#using pipelines 
# Single-step pipeline: fill missing categorical values with each column's
# most frequent value.
cat_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="most_frequent"))])

In [203]:
# Route every object-dtype column through the categorical imputation pipeline
full_pipe = ColumnTransformer([
    ("cat", cat_pipe,make_column_selector(dtype_include = object))
])
# Fit the imputer on the categorical frame and get back the filled values
piped = full_pipe.fit_transform(non_numeric)

In [208]:
# Wrap the imputed values in a DataFrame again; the transformer's feature
# names carry a 'cat__' prefix (e.g. 'cat__player_positions').
non_numeric = pd.DataFrame(data = piped, columns = full_pipe.get_feature_names_out())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Data columns (total 31 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   cat__player_positions  161583 non-null  object
 1   cat__nationality_name  161583 non-null  object
 2   cat__preferred_foot    161583 non-null  object
 3   cat__work_rate         161583 non-null  object
 4   cat__ls                161583 non-null  object
 5   cat__st                161583 non-null  object
 6   cat__rs                161583 non-null  object
 7   cat__lw                161583 non-null  object
 8   cat__lf                161583 non-null  object
 9   cat__cf                161583 non-null  object
 10  cat__rf                161583 non-null  object
 11  cat__rw                161583 non-null  object
 12  cat__lam               161583 non-null  object
 13  cat__cam               161583 non-null  object
 14  cat__ram               161583 non-null  object
 15  

In [209]:
#using binary encoding 
# Every categorical column is passed to the encoder via `cols`
encoder = ce.BinaryEncoder(cols = non_numeric.columns)

In [210]:
# Fit the encoder and replace each categorical column with its binary-digit
# columns (suffixes _0, _1, ...)
non_numeric = encoder.fit_transform(non_numeric)

In [213]:
# Display the encoded categorical frame
non_numeric

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Columns: 268 entries, cat__player_positions_0 to cat__gk_8
dtypes: int64(268)
memory usage: 330.4 MB


In [218]:
# Join the imputed numeric columns and the encoded categorical columns side by
# side; this rebinds `fifaDF`, so the raw frame is no longer reachable by name.
fifaDF = pd.concat([numeric_data, non_numeric], axis= 1)

In [219]:
# Target variable: the player's overall rating
y = fifaDF['overall']

##### Scaling the data

In [223]:
# Feature matrix: every column except the target
X = fifaDF.drop('overall', axis = 1)

In [None]:
# Standardise the features; `X` is rebound to the scaler's transformed output.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling — confirm this is
# acceptable. The fitted scaler is also not saved alongside the models.
scaler = StandardScaler()
X = scaler.fit_transform(X)

##### Training and testing the model 

In [225]:
 # 80/20 split with a fixed seed, stratified on the 'overall' target values
 Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size = 0.2,random_state = 42, stratify = y)

In [226]:
#Linear Regression
lr = LinearRegression()
lr.fit(Xtrain, Ytrain)

#Random Forest Regression 
# Seeded so the fitted forest, its metrics and its pickle are reproducible
# across kernel restarts (same seed as the split and the KFold).
rf = RandomForestRegressor(random_state = 42)
rf.fit(Xtrain, Ytrain)

#Decision Tree Regression 
# Seeded for the same reason.
dt = DecisionTreeRegressor(random_state = 42)
dt.fit(Xtrain, Ytrain)


In [None]:
 for model in [lr, rf, dt]:
     # One pickle per fitted model, e.g. "RandomForestRegressorV2.actual.pkl"
     model_path = model.__class__.__name__ +'V2.actual' +'.pkl'
     # `with` closes (and flushes) the file even if dump raises; the original
     # inline open() never closed its handle.
     with open(model_path, 'wb') as model_file:
         pkl.dump(model, model_file)

In [37]:
#Defining a function for evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X_test : feature matrix passed to ``model.predict``
    y_test : true target values for ``X_test``

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` — mean squared error, mean absolute error and
        R² score of the predictions against ``y_test``.
    """
    predictions = model.predict(X_test)
    metric_functions = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y_test, predictions) for metric in metric_functions)


##### Fine-tuning the model

In [19]:
# Perform GridSearchCV on Decision tree
# `param_grid` is not defined anywhere in the visible notebook, so this cell
# raised NameError on a fresh kernel. The grid below is a small starting point
# over standard DecisionTreeRegressor hyper-parameters; adjust as needed.
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# No `scoring` is passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator = dt, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(Xtrain, Ytrain)

In [262]:
# Get the best parameters and model
# These assignments were commented out, which left `best_model` undefined for
# the performance cell that follows when the notebook is run top to bottom.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Measure performance of the best model
# Predict on the held-out test split and report the mean squared error
best_predictions = best_model.predict(Xtest)
best_mse = mean_squared_error(Ytest, best_predictions)
print(f'Best Mean Squared Error: {best_mse}')

##### Cross-Validation 

In [283]:
# Initialize k-fold cross-validation
# Five shuffled folds with a fixed seed so the fold assignment is repeatable
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [284]:
# Perform cross-validation
# Scores are negated MSE values, so they are <= 0 and closer to 0 is better
cv_scores = cross_val_score(rf, Xtrain, Ytrain, cv=kf, scoring='neg_mean_squared_error')

In [285]:
# Print the cross-validation scores and their mean
# (values are negative because the scorer is 'neg_mean_squared_error')
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean Cross-Validation Score: {np.mean(cv_scores)}")

Cross-Validation Scores: [-0.45047575 -0.44656541 -0.43303641 -0.44243415 -0.44517702]
Mean Cross-Validation Score: -0.4435377497717689


In [286]:
# Evaluation
# Report MSE, MAE and R2 on the held-out test split for each fitted model
for model in [lr, rf, dt]:
    mse, mae, r2 = evaluate_model(model, Xtest, Ytest)
    print(f"Model: {model.__class__.__name__}, MSE: {mse}, MAE: {mae}, R2: {r2}")


Model: LinearRegression, MSE: 3.1491736604730445, MAE: 1.3835550330115471, R2: 0.9364618240613234
Model: RandomForestRegressor, MSE: 0.4230642169755856, MAE: 0.3867026642324474, R2: 0.9914641961512166
Model: DecisionTreeRegressor, MSE: 0.9942445152706006, MAE: 0.4735278645913915, R2: 0.9799399811670474


In [30]:
#creating a function 
def process_file(file_path):
    """Load a players CSV and apply the notebook's preprocessing steps.

    Steps, in order:
      1. keep only columns with fewer than 30% null values;
      2. split into numeric and object-dtype columns;
      3. drop the hard-coded column lists (missing names are ignored);
      4. impute numeric columns with IterativeImputer and round the result;
      5. impute object columns with their most frequent value;
      6. binary-encode the object columns;
      7. concatenate numeric and encoded columns.

    NOTE(review): the imputers and the encoder are fitted on the file passed
    in rather than reused from training, so encodings and the number of
    output columns depend on this file's contents — confirm this matches what
    the trained models expect.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rounded, imputed numeric columns followed by the binary-encoded
        categorical columns.

    Raises
    ------
    KeyError
        If the filtered data has no numeric ``'overall'`` column.
    """
    # Load the dataset (low_memory=False matches how the training file is read)
    data = pd.read_csv(file_path, low_memory=False)

    # Select columns with less than 30% null values
    null_counts = data.isnull().sum()
    kept_columns = list(null_counts.index[null_counts < (0.3 * (data.shape[0]))])
    data = data[kept_columns]

    # Split the data into numeric and non-numeric
    numeric_data = data.select_dtypes(include=np.number)
    non_numeric = data.select_dtypes(include=['object'])

    # The original built a correlation table against 'overall' here and never
    # used it; its only observable effect was a KeyError when 'overall' was
    # absent, which this explicit guard preserves.
    if 'overall' not in numeric_data.columns:
        raise KeyError('overall')

    # Drop specified numeric columns (rebinding instead of mutating in place)
    columns_to_drop = ['player_id','sofifa_id' ,'nationality_id', 'league_id', 'club_team_id', 'club_jersey_number', 
                       'fifa_version', 'weight_kg', 'movement_balance', 'club_contract_valid_until_year', 
                       'height_cm', 'fifa_update','nation_team_id']
    numeric_data = numeric_data.drop(columns=columns_to_drop, errors='ignore')

    # Impute missing values in numeric data, then round to whole numbers
    imp = IterativeImputer(max_iter=10, random_state=0)
    numeric_data = pd.DataFrame(np.round(imp.fit_transform(numeric_data)), columns=numeric_data.columns)

    # Drop specified non-numeric columns
    columns_to_drop = ['player_url', 'fifa_update_date', 'player_face_url', 'dob', 'short_name', 
                       'long_name', 'league_name', 'club_name', 'club_position', 'club_joined_date', 
                       'real_face', 'body_type']
    non_numeric = non_numeric.drop(columns=columns_to_drop, errors='ignore')

    # Drop additional specified columns. This now happens before imputation:
    # most-frequent imputation works column by column, so the surviving
    # columns are filled identically and no work is spent on discarded ones.
    additional_columns_to_drop = ['sofifa_id','player_url','player_face_url','dob','short_name', 'long_name',
                                  'league_name','club_team_id','club_jersey_number','club_loaned_from',
                                  'nationality_id','nation_team_id','nation_jersey_number','real_face',
                                  'body_type','release_clause_eur','player_tags','player_traits',
                                  'mentality_composure','nation_position', 'goalkeeping_speed','club_joined',
                                  'club_contract_valid_until']
    non_numeric = non_numeric.drop(columns=additional_columns_to_drop, errors='ignore')

    # Define a simple imputer for categorical data
    cat_pipe = Pipeline([
         ("impute", SimpleImputer(strategy="most_frequent")),
    ])

    # Apply the imputer to non-numeric data
    full_pipe = ColumnTransformer([("cat", cat_pipe, make_column_selector(dtype_include=object))])
    non_numeric = pd.DataFrame(full_pipe.fit_transform(non_numeric), columns=non_numeric.columns)

    # Encode non-numeric data
    encoder = ce.BinaryEncoder(cols=non_numeric.columns)
    non_numeric = encoder.fit_transform(non_numeric)

    # Concatenate the data into a single DataFrame
    processedDF = pd.concat([numeric_data, non_numeric], axis=1)

    return processedDF


In [122]:
# Run the same preprocessing steps on the separate players_22 file
players_22 = process_file('players_22.csv')

In [121]:
# Save the processed frame to disk without the index column
players_22.to_csv('New_data.csv', index=False)

In [67]:
#selecting the features (X_new) and target (y_new) of the new data for evaluation
X_new = players_22.drop(columns=['overall'])  # Features
# NOTE(review): a brand-new StandardScaler is fitted on the new data here (and
# rebinds `scaler`), so these features are standardised with different
# statistics than the training features were — confirm this is intended.
scaler = StandardScaler()
X_new = scaler.fit_transform(X_new)
y_new = players_22['overall']  # Target variable

In [113]:
# Load the best model which was the RandomForestRegressor
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# produced itself. This also rebinds `best_model`.
with open("RandomForestRegressorV2.actual.pkl", 'rb') as file:
    best_model = pkl.load(file)


# The decision tree is loaded as a second model for comparison
with open("DecisionTreeRegressorV2.actual.pkl", 'rb') as file:
    model_2 = pkl.load(file)

In [114]:
# Measure performance on the new dataset
# The random forest is reported first, then the decision tree. After the loop
# `new_predictions` and `new_mse` hold the decision tree's values.
for _candidate in (best_model, model_2):
    new_predictions = _candidate.predict(X_new)
    new_mse = mean_squared_error(y_new, new_predictions)
    print(f'New Data Mean Squared Error: {new_mse}')

New Data Mean Squared Error: 2.9067601590519256
New Data Mean Squared Error: 4.7315349030614895


In [115]:
# Full metric report (MSE, MAE, R2) on the new data: random forest first, then
# the decision tree. `mse`, `mae` and `r2` end up holding the decision tree's values.
for _candidate in (best_model, model_2):
    mse, mae, r2 = evaluate_model(_candidate, X_new, y_new)
    print(f"Model: {_candidate.__class__.__name__}, MSE: {mse}, MAE: {mae}, R2: {r2}")





Model: RandomForestRegressor, MSE: 2.9067601590519256, MAE: 1.144561047871511, R2: 0.9385918986433325
Model: DecisionTreeRegressor, MSE: 4.7315349030614895, MAE: 1.4256458235875045, R2: 0.9000417788185944


In [123]:
# Load the trained model
# This is the decision-tree pickle; `model` is the global that
# predict_player_rating reads. Only unpickle files this notebook produced.
with open('DecisionTreeRegressorV2.actual.pkl', 'rb') as file:
    model = pkl.load(file)

    

In [124]:
# Define the prediction function
def predict_player_rating(features):
    """Predict one player's rating with the module-level ``model``.

    Parameters
    ----------
    features : sequence of numbers
        Feature values for a single player; reshaped to a 1-row 2-D array.

    Returns
    -------
    The first (and only) element of ``model.predict``'s output.
    """
    single_row = np.array(features).reshape((1, -1))
    return model.predict(single_row)[0]

In [125]:
# Streamlit app interface
st.title('Player Rating Prediction')

# Input features from user
# NOTE(review): this list has 78 entries (47 numeric names plus 31 'cat__'
# names), whereas the training matrix shown earlier had 47 numeric plus 268
# binary-encoded columns and was standardised — confirm the loaded model
# accepts input of this shape and scale.
features_list = [ 'potential', 'value_eur', 
    'wage_eur', 'age', 'league_level', 'weak_foot', 'skill_moves', 
    'international_reputation', 'pace', 'shooting', 'passing', 'dribbling', 
    'defending', 'physic', 'attacking_crossing', 'attacking_finishing', 
    'attacking_heading_accuracy', 'attacking_short_passing', 
    'attacking_volleys', 'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 
    'skill_long_passing', 'skill_ball_control', 'movement_acceleration', 
    'movement_sprint_speed', 'movement_agility', 'movement_reactions', 
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 
    'power_long_shots', 'mentality_aggression', 'mentality_interceptions', 
    'mentality_positioning', 'mentality_vision', 'mentality_penalties', 
    'mentality_composure', 'defending_marking_awareness', 
    'defending_standing_tackle', 'defending_sliding_tackle', 
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking', 
    'goalkeeping_positioning', 'goalkeeping_reflexes','cat__player_positions', 'cat__nationality_name', 'cat__preferred_foot', 
    'cat__work_rate', 'cat__ls', 'cat__st', 'cat__rs', 'cat__lw', 'cat__lf', 
    'cat__cf', 'cat__rf', 'cat__rw', 'cat__lam', 'cat__cam', 'cat__ram', 
    'cat__lm', 'cat__lcm', 'cat__cm', 'cat__rcm', 'cat__rm', 'cat__lwb', 
    'cat__ldm', 'cat__cdm', 'cat__rdm', 'cat__rwb', 'cat__lb', 'cat__lcb', 
    'cat__cb', 'cat__rcb', 'cat__rb', 'cat__gk'
]
# One numeric input widget per feature, collected in features_list order
input_data = [
    st.number_input(f'Enter {feature}', value=0.0)
    for feature in features_list
]

# Predict only when the button is pressed
if st.button('Predict Rating'):
    rating = predict_player_rating(input_data)
    st.write(f'Predicted Player Rating: {rating}')