FIFA Player Data Analysis and Modeling


Data Preprocessing

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import xgboost as xgb
import pickle
import joblib

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
#Inspecting Data
legacy_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/male_players (legacy).csv')
print(legacy_df.head())
print(legacy_df.describe())
print(legacy_df.info())

  legacy_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/male_players (legacy).csv')


   player_id                                        player_url  fifa_version  \
0     158023                /player/158023/lionel-messi/150002            15   
1      20801  /player/20801/c-ronaldo-dos-santos-aveiro/150002            15   
2       9014                  /player/9014/arjen-robben/150002            15   
3      41236           /player/41236/zlatan-ibrahimovic/150002            15   
4     167495                /player/167495/manuel-neuer/150002            15   

   fifa_update fifa_update_date         short_name  \
0            2       2014-09-18           L. Messi   
1            2       2014-09-18  Cristiano Ronaldo   
2            2       2014-09-18          A. Robben   
3            2       2014-09-18     Z. Ibrahimović   
4            2       2014-09-18           M. Neuer   

                             long_name player_positions  overall  potential  \
0       Lionel Andrés Messi Cuccittini               CF       93         95   
1  Cristiano Ronaldo dos Santos Avei

In [9]:
#Dropping Columns with More Than 30% Null Values
threshold = 0.3
total_rows = len(legacy_df)
legacy_df = legacy_df.loc[:, legacy_df.isnull().mean() < threshold]
print(f"Data shape after dropping columns with >{threshold*100}% null values: {legacy_df.shape}")

Data shape after dropping columns with >30.0% null values: (161583, 102)


In [10]:
#Dropping Irrelevant Columns
useless_columns = ['player_id','player_url','player_face_url','short_name','long_name','fifa_version','fifa_update',
                   'fifa_update_date','dob','league_name','league_id','club_jersey_number','club_joined_date',
                   'club_contract_valid_until_year','nationality_id','nationality_name','preferred_foot',
                   'real_face','club_team_id','league_level']
legacy_df = legacy_df.drop(columns=useless_columns)
print("Remaining columns:", legacy_df.columns)


Remaining columns: Index(['player_positions', 'overall', 'potential', 'value_eur', 'wage_eur',
       'age', 'height_cm', 'weight_kg', 'club_name', 'club_position',
       'weak_foot', 'skill_moves', 'international_reputation', 'work_rate',
       'body_type', 'pace', 'shooting', 'passing', 'dribbling', 'defending',
       'physic', 'attacking_crossing', 'attacking_finishing',
       'attacking_heading_accuracy', 'attacking_short_passing',
       'attacking_volleys', 'skill_dribbling', 'skill_curve',
       'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
       'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
       'movement_reactions', 'movement_balance', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
   

In [11]:
#Numeric and Categorical Data
numeric_data = legacy_df.select_dtypes(include=['int64', 'float64'])
categorical_data = legacy_df.select_dtypes(exclude=['int64', 'float64'])
print("Numeric data info:", numeric_data.info())
print("Textual data info:", categorical_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161583 entries, 0 to 161582
Data columns (total 50 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   overall                      161583 non-null  int64  
 1   potential                    161583 non-null  int64  
 2   value_eur                    159530 non-null  float64
 3   wage_eur                     159822 non-null  float64
 4   age                          161583 non-null  int64  
 5   height_cm                    161583 non-null  int64  
 6   weight_kg                    161583 non-null  int64  
 7   weak_foot                    161583 non-null  int64  
 8   skill_moves                  161583 non-null  int64  
 9   international_reputation     161583 non-null  int64  
 10  pace                         143614 non-null  float64
 11  shooting                     143614 non-null  float64
 12  passing                      143614 non-null  float64
 13 

In [12]:
#Imputing Missing Values
numeric_data.fillna(numeric_data.mean(), inplace=True)

In [13]:
for column in categorical_data:
    categorical_data[column].fillna(categorical_data[column].mode()[0], inplace=True)

In [14]:
#Processing the 'gk' Column
# Replacing NaN with value
categorical_data['gk'].fillna('0+0', inplace=True)

In [15]:
# Splitting gk column
categorical_data[['gk_base', 'gk_modifier']] = categorical_data['gk'].str.split('+', expand=True)

In [16]:
#Adding the splits
categorical_data['gk_combined'] = categorical_data['gk_base'] + categorical_data['gk_modifier']

In [17]:
# Dropping the original column
categorical_data.drop(columns=['gk', 'gk_base', 'gk_modifier'], inplace=True)

In [18]:
#encode cateogrical data
label_encoder = LabelEncoder()
for column in categorical_data:
    categorical_data[column] = label_encoder.fit_transform(categorical_data[column])

In [19]:
#New and cleaned Dataframe
cleaned_legacy = pd.concat([numeric_data, categorical_data], axis=1)
print("Cleaned data shape:", cleaned_legacy.shape)

Cleaned data shape: (161583, 82)


Feature Engineernig

In [20]:
X = cleaned_legacy.drop('overall', axis=1)
y = cleaned_legacy['overall']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
#Using Random Forest Clssifier to get feature importances
rforest_classifier = RandomForestClassifier(n_estimators=112, max_depth=12, criterion='entropy')
rforest_classifier.fit(X_train, y_train)
feature_importance = rforest_classifier.feature_importances_

In [22]:
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(20))

               Feature  Importance
1            value_eur    0.110534
0            potential    0.058887
28  movement_reactions    0.042117
78                 rcb    0.041225
76                 lcb    0.039443
77                  cb    0.037792
69                  rm    0.033286
65                  lm    0.029624
2             wage_eur    0.027103
75                  lb    0.024395
79                  rb    0.024133
59                  cf    0.023851
58                  lf    0.022347
3                  age    0.022179
60                  rf    0.021636
62                 lam    0.020978
63                 cam    0.020517
54                  ls    0.019971
13           defending    0.019098
56                  rs    0.019017


In [23]:
#Top Correlated Features
selected_features = [
    'movement_reactions', 'potential', 'passing', 'wage_eur', 'value_eur',
    'dribbling', 'attacking_short_passing', 'international_reputation', 'skill_long_passing',
    'physic', 'age', 'skill_ball_control', 'shooting', 'skill_curve', 'weak_foot',
    'skill_moves', 'skill_dribbling', 'attacking_finishing'
]

In [24]:
# Standardize selected features
scaler = StandardScaler()
X = cleaned_legacy[selected_features]
X_scaled = scaler.fit_transform(X)
y = cleaned_legacy['overall']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

Training And Evaluating Models

In [26]:
#Random Forest Regressor
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
initial_predictions = model.predict(X_test)
initial_rmse = np.sqrt(mean_squared_error(y_test, initial_predictions))
print(f"Initial RMSE (RandomForest): {initial_rmse}")

Initial RMSE (RandomForest): 0.8650163457763402


In [27]:
# Hyperparameter tuning for RandomForestRegressor
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
}
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=4)
grid_search.fit(X_train, y_train)

In [28]:
#best model
best_model = grid_search.best_estimator_
rmse_scores = np.sqrt(-cross_val_score(best_model, X, y, scoring='neg_mean_squared_error', cv=5))
print(f"Mean RMSE (RandomForest with GridSearch): {np.mean(rmse_scores)}")


Mean RMSE (RandomForest with GridSearch): 1.2573534601513245


In [29]:
#XGBoost Model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_initial_predictions = xgb_model.predict(X_test)
xgb_initial_rmse = np.sqrt(mean_squared_error(y_test, xgb_initial_predictions))
print(f"Initial RMSE (XGBoost): {xgb_initial_rmse}")

Initial RMSE (XGBoost): 0.9357982258867391


In [30]:
# Hyperparameter tuning for XGBoost
param_grid_xgb = {
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 500],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'gamma': [0, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

In [31]:
scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)
cv = KFold(n_splits=3, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb_model, param_grid=param_grid_xgb, scoring=scorer, cv=cv)
xgb_grid_search.fit(X_train, y_train)

In [32]:
#best XGBoost model
xgb_best_model = xgb_grid_search.best_estimator_
xgb_cv_scores = cross_val_score(xgb_best_model, X, y, cv=cv, scoring=scorer)
xgb_rmse_scores = -xgb_cv_scores
print(f"Mean RMSE (XGBoost with GridSearch): {np.mean(xgb_rmse_scores)}")

Mean RMSE (XGBoost with GridSearch): 0.9196351624281256


In [33]:
#Gradient Boosting Regressor
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
gb_initial_predictions = gb_model.predict(X_test)
gb_initial_rmse = np.sqrt(mean_squared_error(y_test, gb_initial_predictions))
print(f"Initial RMSE (GradientBoosting): {gb_initial_rmse}")

Initial RMSE (GradientBoosting): 1.2114824134971478


In [34]:
# Hyperparameter tuning
param_grid_gb = {
    'n_estimators': [150, 200],
    'learning_rate': [0.1, 0.2],
    'max_depth': [5, 7],
    'min_samples_split': [2, 5]
}
gb_grid_search = GridSearchCV(gb_model, param_grid_gb, scoring=scorer, cv=cv)
gb_grid_search.fit(X_train, y_train)

In [35]:
#best GradientBoosting model
gb_best_model = gb_grid_search.best_estimator_
gb_cv_scores = cross_val_score(gb_best_model, X, y, cv=cv, scoring=scorer)
gb_rmse_scores = -gb_cv_scores
print(f"Mean RMSE (GradientBoosting with GridSearch): {np.mean(gb_rmse_scores)}")

Mean RMSE (GradientBoosting with GridSearch): 0.8903359260259416


Testing and Deployment

In [36]:
#new dataset
players_22 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/players_22-1.csv')


  players_22 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/players_22-1.csv')


In [37]:
#Dropping Columns with More Than 30% Null Values
threshold = 0.3
total_rows = len(players_22)

players_22 = players_22.loc[:, players_22.isnull().mean() < threshold]
print(f"Data shape after dropping columns with >{threshold*100}% null values: {players_22.shape}")

Data shape after dropping columns with >30.0% null values: (19239, 102)


In [38]:
# List of columns to drop
useless_columns22 = [
    'sofifa_id', 'player_url', 'long_name', 'dob', 'short_name', 'body_type', 'real_face',
    'player_face_url', 'club_logo_url', 'nation_flag_url', 'club_flag_url'
]
players_22 = players_22.drop(useless_columns22, axis=1)

In [39]:
#numeric and categorical columns
numeric_data1 = players_22.select_dtypes(include=['int64', 'float64'])
categorical_data1 = players_22.select_dtypes(exclude=['int64', 'float64'])

In [40]:
numeric_data1.info()
categorical_data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Data columns (total 56 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   overall                      19239 non-null  int64  
 1   potential                    19239 non-null  int64  
 2   value_eur                    19165 non-null  float64
 3   wage_eur                     19178 non-null  float64
 4   age                          19239 non-null  int64  
 5   height_cm                    19239 non-null  int64  
 6   weight_kg                    19239 non-null  int64  
 7   club_team_id                 19178 non-null  float64
 8   league_level                 19178 non-null  float64
 9   club_jersey_number           19178 non-null  float64
 10  club_contract_valid_until    19178 non-null  float64
 11  nationality_id               19239 non-null  int64  
 12  weak_foot                    19239 non-null  int64  
 13  skill_moves     

In [41]:
#Imputing Missing Values
numeric_data1.fillna(numeric_data1.mean(), inplace=True)

In [42]:
for column in categorical_data1:
    categorical_data1[column].fillna(categorical_data1[column].mode()[0], inplace=True)

In [43]:
#remaining null values
print(numeric_data1.isnull().sum())
print(categorical_data1.isnull().sum())

overall                        0
potential                      0
value_eur                      0
wage_eur                       0
age                            0
height_cm                      0
weight_kg                      0
club_team_id                   0
league_level                   0
club_jersey_number             0
club_contract_valid_until      0
nationality_id                 0
weak_foot                      0
skill_moves                    0
international_reputation       0
release_clause_eur             0
pace                           0
shooting                       0
passing                        0
dribbling                      0
defending                      0
physic                         0
attacking_crossing             0
attacking_finishing            0
attacking_heading_accuracy     0
attacking_short_passing        0
attacking_volleys              0
skill_dribbling                0
skill_curve                    0
skill_fk_accuracy              0
skill_long

In [44]:
#Encoding
label_encoder = LabelEncoder()
for column in categorical_data1:
    categorical_data1[column] = label_encoder.fit_transform(categorical_data1[column])

In [45]:
#Combining Numeric and Categorical Data into one dataframe
new_players_22 = pd.concat([numeric_data1, categorical_data1], axis=1)
print("Cleaned data shape:", new_players_22.shape)

Cleaned data shape: (19239, 91)


In [53]:
#Data Preparation for Prediction
selected_features = [
    'movement_reactions', 'potential', 'passing', 'wage_eur', 'value_eur',
    'dribbling', 'attacking_short_passing', 'international_reputation', 'skill_long_passing',
    'physic', 'age', 'skill_ball_control', 'shooting', 'skill_curve','weak_foot','skill_moves','skill_dribbling','attacking_finishing'
]
scaler = StandardScaler()
X = new_players_22[selected_features]
X_scaled = scaler.fit_transform(X)


In [81]:
# Define the directory to save the models
model_directory = '/content/drive/My Drive/Colab Notebooks/'

In [82]:
# Save the models
filename_rf = model_directory + 'random_forest_model.sav'
joblib.dump(best_model, filename_rf)

filename_xgb = model_directory + 'xgboost_model.sav'
joblib.dump(xgb_best_model, filename_xgb)

filename_gb = model_directory + 'gradient_boosting_model.sav'
joblib.dump(gb_best_model, filename_gb)

['/content/drive/My Drive/Colab Notebooks/gradient_boosting_model.sav']

In [83]:
# Make predictions
rf_predictions = best_model.predict(X_scaled)
xgb_predictions = xgb_best_model.predict(X_scaled)
gb_predictions = gb_best_model.predict(X_scaled)

In [84]:
actual_data = new_players_22['overall']

rf_rmse_new = np.sqrt(mean_squared_error(actual_data, rf_predictions))
xgb_rmse_new = np.sqrt(mean_squared_error(actual_data, xgb_predictions))
gb_rmse_new = np.sqrt(mean_squared_error(actual_data, gb_predictions))

print("Random Forest RMSE on new data:", rf_rmse_new)
print("XGBoost RMSE on new data:", xgb_rmse_new)
print("Gradient Boosting RMSE on new data:", gb_rmse_new)

Random Forest RMSE on new data: 1.2878936536104464
XGBoost RMSE on new data: 1.1600579040595962
Gradient Boosting RMSE on new data: 1.0782143299956066
