In [1]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor,VotingRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score


In [2]:
train_21 = pd.read_csv('male_players (legacy).csv')

train_22 = pd.read_csv('players_22.csv')

  train_21 = pd.read_csv('male_players (legacy).csv')
  train_22 = pd.read_csv('players_22.csv')


Checking and dropping variables with more than 30% missing values

In [3]:
threshold = 0.3 * train_21.shape[0]
train_21 = train_21.dropna(thresh=threshold, axis=1)
train_21.drop(columns=['wage_eur','value_eur', 'international_reputation', 'release_clause_eur'], inplace=True)


Selecting Numeric Features

In [4]:
# selecting numerical features
numerical_features = train_21.select_dtypes(include =[np.number])

Multivariable Imputation, Scaling and Normalization on numerical variables

In [5]:
#Simple imputing
imp = SimpleImputer(strategy='mean')
numerical_features = pd.DataFrame(imp.fit_transform(numerical_features), columns=numerical_features.columns, index=numerical_features.index)

#Scaling
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numerical_features)

#Iterative imputing
imp = IterativeImputer(initial_strategy='mean', max_iter=50, random_state=0)
imputed_scaled_data = imp.fit_transform(scaled_data)
imputed_data = scaler.inverse_transform(imputed_scaled_data)

numerical_features = pd.DataFrame(imputed_data, columns = numerical_features.columns, index=numerical_features.index)
#Filling remaining missing values.
numerical_features.fillna(numerical_features.mean())

Unnamed: 0,player_id,fifa_version,fifa_update,overall,potential,age,height_cm,weight_kg,league_id,league_level,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,158023.0,15.0,2.0,93.0,95.0,27.0,169.0,67.0,53.0,1.0,...,76.0,57.816892,25.0,21.0,20.0,6.0,11.0,15.0,14.0,8.0
1,20801.0,15.0,2.0,92.0,92.0,29.0,185.0,80.0,53.0,1.0,...,85.0,57.816892,22.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0
2,9014.0,15.0,2.0,90.0,90.0,30.0,180.0,80.0,19.0,1.0,...,80.0,57.816892,29.0,26.0,26.0,10.0,8.0,11.0,5.0,15.0
3,41236.0,15.0,2.0,90.0,90.0,32.0,195.0,95.0,16.0,1.0,...,91.0,57.816892,25.0,41.0,27.0,13.0,15.0,10.0,9.0,12.0
4,167495.0,15.0,2.0,90.0,90.0,28.0,193.0,92.0,19.0,1.0,...,37.0,57.816892,25.0,25.0,25.0,87.0,85.0,92.0,90.0,86.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161578,269011.0,23.0,2.0,46.0,61.0,18.0,180.0,73.0,2012.0,1.0,...,54.0,40.000000,23.0,21.0,25.0,9.0,13.0,13.0,12.0,7.0
161579,269019.0,23.0,2.0,46.0,58.0,19.0,188.0,83.0,2012.0,1.0,...,31.0,35.000000,50.0,51.0,45.0,6.0,14.0,8.0,13.0,14.0
161580,271093.0,23.0,2.0,46.0,58.0,19.0,181.0,73.0,65.0,1.0,...,37.0,35.000000,36.0,45.0,50.0,8.0,9.0,7.0,14.0,9.0
161581,271555.0,23.0,2.0,46.0,70.0,17.0,175.0,68.0,65.0,1.0,...,63.0,43.000000,19.0,17.0,14.0,13.0,12.0,14.0,7.0,13.0


In [6]:
#Specifying dependent and independent variables
y = numerical_features['overall']
X = numerical_features.drop(columns = ['overall'])

Feature Importance

In [7]:
selector = SelectKBest(score_func=f_regression, k=17)  # Select top 17 features
X_new = selector.fit_transform(X, y)
important_features = X.columns[selector.get_support()]

train_21[important_features]

Unnamed: 0,potential,age,shooting,passing,dribbling,physic,attacking_crossing,attacking_short_passing,skill_curve,skill_long_passing,skill_ball_control,movement_reactions,power_shot_power,power_long_shots,mentality_aggression,mentality_vision,mentality_composure
0,95,27,89.0,86.0,96.0,63.0,84,89,89,76,96,94,80,88,48,90,
1,92,29,93.0,81.0,91.0,79.0,83,82,88,72,92,90,94,93,63,81,
2,90,30,86.0,83.0,92.0,64.0,80,86,85,76,90,89,86,90,47,84,
3,90,32,91.0,81.0,86.0,86.0,76,84,80,76,90,85,93,88,84,83,
4,90,28,,,,,25,42,25,41,31,89,42,25,29,20,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161578,61,18,48.0,40.0,47.0,44.0,48,38,39,33,45,39,64,31,26,42,40.0
161579,58,19,25.0,29.0,34.0,57.0,30,30,24,25,32,42,33,25,46,31,35.0
161580,58,19,36.0,43.0,46.0,53.0,37,51,30,43,35,50,51,32,51,40,35.0
161581,70,17,50.0,36.0,46.0,42.0,29,40,38,26,43,45,42,47,27,44,43.0


In [8]:
#headers = numerical_features.columns

In [9]:
X = pd.DataFrame(X_new, columns=important_features, index = X.index)

Splitting data to training and testing

In [10]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

In [11]:
dtree = DecisionTreeRegressor(max_depth=15)
lregressor = LinearRegression()
sdg = SGDRegressor(max_iter = 5000)
gb = GradientBoostingRegressor()

### Training Models

In [12]:
for model in (dtree, lregressor, sdg, gb):
    model.fit(Xtrain, Ytrain)

    y_pred = model.predict(Xtest)

    print(f'''\n {model.__class__.__name__}: \n Mean Absolute error: {mean_absolute_error(y_pred, Ytest)} \n Mean Squared Error: {mean_squared_error(y_pred, Ytest)} \n R2_score: {r2_score(y_pred, Ytest)}''')



 DecisionTreeRegressor: 
 Mean Absolute error: 1.0111815404153806 
 Mean Squared Error: 2.4918431731467656 
 R2_score: 0.9487164229986654

 LinearRegression: 
 Mean Absolute error: 1.5912113488288564 
 Mean Squared Error: 4.221647556477041 
 R2_score: 0.9072980732746875

 SGDRegressor: 
 Mean Absolute error: 4681641896038.084 
 Mean Squared Error: 2.3473201698655436e+25 
 R2_score: -14.091125143385373

 GradientBoostingRegressor: 
 Mean Absolute error: 1.0180466202364442 
 Mean Squared Error: 2.0009775082842975 
 R2_score: 0.9568529324162984


In [13]:
#Preparing testing dataset
train_22_cleaned = train_22.select_dtypes(include=[np.number]).dropna(axis=1)
common_features = list(set(X.columns) & set(train_22_cleaned.columns))
    
new_X = X[common_features]
x_test22 = train_22_cleaned[common_features]
y_test22 = train_22_cleaned['overall']
common_features

['attacking_crossing',
 'movement_reactions',
 'mentality_vision',
 'power_long_shots',
 'attacking_short_passing',
 'mentality_aggression',
 'power_shot_power',
 'mentality_composure',
 'potential',
 'age',
 'skill_curve',
 'skill_long_passing',
 'skill_ball_control']

In [14]:
#Training models again with new dataset
for model in (dtree, lregressor,  sdg, gb):
    #Spliting into trainig and testing
    Xtrain,X_test,Ytrain,Y_test=train_test_split(new_X,y,test_size=0.2,random_state=42,stratify=y)
    model.fit(Xtrain, Ytrain)
    
    y_pred = model.predict(X_test)

    print(f'''\n {model.__class__.__name__}: \n Mean Absolute error: {mean_absolute_error(y_pred, Y_test)} \n Mean Squared Error: {mean_squared_error(y_pred, Y_test)} \n R2_score: {r2_score(y_pred, Y_test)}''')



 DecisionTreeRegressor: 
 Mean Absolute error: 1.017275326958655 
 Mean Squared Error: 2.554783878603068 
 R2_score: 0.9475475116703616

 LinearRegression: 
 Mean Absolute error: 1.7161504605753717 
 Mean Squared Error: 4.8847923272298335 
 R2_score: 0.8914508013566176

 SGDRegressor: 
 Mean Absolute error: 3530646405310.5293 
 Mean Squared Error: 1.294613199136135e+25 
 R2_score: -25.93362837390577

 GradientBoostingRegressor: 
 Mean Absolute error: 1.051202659491421 
 Mean Squared Error: 2.159652789506142 
 R2_score: 0.9532256809038674


In [15]:
#Evaluating models on testing dataset
for model in (dtree, lregressor,  sdg, gb):
    X_train,X_test,Y_train,Y_test=train_test_split(new_X,y,test_size=0.2,random_state=42,stratify=y)
    model.fit(X_train, Y_train)
    Y_test = Y_test[:len(x_test22)]
    y_pred = model.predict(x_test22)
    
    print(f'''\n {model.__class__.__name__} \n Mean Absolute error: {mean_absolute_error(y_pred, Y_test)} \n Mean Squared Error: {mean_squared_error(y_pred, Y_test)} \n R2_score: {r2_score(y_pred, Y_test)}''')



 DecisionTreeRegressor 
 Mean Absolute error: 7.84489548624305 
 Mean Squared Error: 96.89744462988111 
 R2_score: -1.0897216609698979

 LinearRegression 
 Mean Absolute error: 7.668894478790563 
 Mean Squared Error: 92.9538767084891 
 R2_score: -1.1872424900506076

 SGDRegressor 
 Mean Absolute error: 305705443857.5611 
 Mean Squared Error: 1.3508624028329941e+23 
 R2_score: -0.510149592445662

 GradientBoostingRegressor 
 Mean Absolute error: 7.7381991558800545 
 Mean Squared Error: 94.25537866334676 
 R2_score: -1.1577355629087158


## Ensemble Learning

### Grid Search with Cross Validation

Using voting Regressor

In [16]:
voting_ensemble_model = VotingRegressor(estimators=[
    ('Decision Tree', dtree),
    ('Linear Regressor', lregressor),
    ('SDG Regressor', sdg),
    ('Gradient Boosting Regressor', gb)
])

X_test = X_test[:len(Y_test)]
voting_ensemble_model.fit(Xtrain, Ytrain)
voting_y_predict = voting_ensemble_model.predict(X_test)


In [17]:
#Evaluating ensembling model
print("Voting Regressor ensemble performance on testing dataset")
for model in (dtree, lregressor,  sdg, gb):
    print(f'''\n {model.__class__.__name__}: \n Mean Absolute error: {mean_absolute_error(Y_test, voting_y_predict)} \n Mean Squared Error: {mean_squared_error(Y_test, voting_y_predict)} \n R2_score: {r2_score(Y_test, voting_y_predict)}''')

Voting Regressor ensemble performance on testing dataset

 DecisionTreeRegressor: 
 Mean Absolute error: 105197463865.04686 
 Mean Squared Error: 1.57965783189954e+22 
 R2_score: -3.233736157171153e+20

 LinearRegression: 
 Mean Absolute error: 105197463865.04686 
 Mean Squared Error: 1.57965783189954e+22 
 R2_score: -3.233736157171153e+20

 SGDRegressor: 
 Mean Absolute error: 105197463865.04686 
 Mean Squared Error: 1.57965783189954e+22 
 R2_score: -3.233736157171153e+20

 GradientBoostingRegressor: 
 Mean Absolute error: 105197463865.04686 
 Mean Squared Error: 1.57965783189954e+22 
 R2_score: -3.233736157171153e+20


In [18]:
best_model = None
best_score = float('inf')
results = {}

for model in (dtree, lregressor,  sdg, gb):
    y_predict = model.predict(X_test)
    mse = mean_squared_error(Y_test, y_predict)
    r2 = r2_score(Y_test, y_predict)
    results[model.__class__.__name__] = {'MSE': mse, 'R2': r2}
    print(f"Model: {model.__class__.__name__}, MSE: {mse}, R2: {r2}")
    if mse < best_score:
        best_score = mse
        best_model = model

print("\nAll Results:")
print(results)

print("\nBest Model:")
print(best_model)

Model: DecisionTreeRegressor, MSE: 2.5689159840608817, R2: 0.9474114182537672
Model: LinearRegression, MSE: 4.871870757885746, R2: 0.9002673597743928
Model: SGDRegressor, MSE: 1.3586242764364495e+23, R2: -2.781255761850613e+21
Model: GradientBoostingRegressor, MSE: 2.124493596215269, R2: 0.956509241311464

All Results:
{'DecisionTreeRegressor': {'MSE': 2.5689159840608817, 'R2': 0.9474114182537672}, 'LinearRegression': {'MSE': 4.871870757885746, 'R2': 0.9002673597743928}, 'SGDRegressor': {'MSE': 1.3586242764364495e+23, 'R2': -2.781255761850613e+21}, 'GradientBoostingRegressor': {'MSE': 2.124493596215269, 'R2': 0.956509241311464}}

Best Model:
GradientBoostingRegressor()


In [19]:
#Testing ensembling model with 2022 dataset
voting_y22_predict = voting_ensemble_model.predict(x_test22)

#Evaluating
for model in (dtree, lregressor,  sdg, gb):
    print(f'''\n {model.__class__.__name__}: \n Mean Absolute error: {mean_absolute_error(voting_y22_predict, y_test22)} \n Mean Squared Error: {mean_squared_error(voting_y22_predict, y_test22)} \n R2_score: {r2_score(voting_y22_predict, y_test22)}''')


 DecisionTreeRegressor: 
 Mean Absolute error: 103285225785.55765 
 Mean Squared Error: 1.515203829051196e+22 
 R2_score: -1.0542499092540267

 LinearRegression: 
 Mean Absolute error: 103285225785.55765 
 Mean Squared Error: 1.515203829051196e+22 
 R2_score: -1.0542499092540267

 SGDRegressor: 
 Mean Absolute error: 103285225785.55765 
 Mean Squared Error: 1.515203829051196e+22 
 R2_score: -1.0542499092540267

 GradientBoostingRegressor: 
 Mean Absolute error: 103285225785.55765 
 Mean Squared Error: 1.515203829051196e+22 
 R2_score: -1.0542499092540267


In [20]:
# Define the parameter grid for each model
params_dtree = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

params_sdg = {
    'loss': ['squared_error', 'epsilon_insensitive', 'huber'],
    'penalty': ['l2', 'l1'], # regularization
    'alpha': [0.0001, 0.001, 0.01],
    'l1_ratio': [0.15, 0.5, 0.85],
    'max_iter': [1000, 5000, 10000],
    'learning_rate': ['constant', 'optimal', 'adaptive']
}

params_gb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7]
}

# Define the cross-validation strategy
cv = KFold(n_splits=3)

# Perform GridSearchCV for each model
grid_search_dtree = GridSearchCV(estimator=dtree, param_grid=params_dtree, cv=cv, n_jobs=-1, verbose=2)
grid_search_sdg = GridSearchCV(estimator=sdg, param_grid=params_sdg, cv=cv, n_jobs=-1, verbose=2)
grid_search_gb = GridSearchCV(estimator=gb, param_grid=params_gb, cv=cv, n_jobs=-1, verbose=2)

# Fit each grid search
grid_search_dtree.fit(X_train, Y_train)
grid_search_sdg.fit(X_train, Y_train)
grid_search_gb.fit(X_train, Y_train)

# Get the best models from grid search
best_dtree = grid_search_dtree.best_estimator_
best_sdg = grid_search_sdg.best_estimator_
best_gb = grid_search_gb.best_estimator_

# Define the VotingRegressor with the best estimators
voting_regressor = VotingRegressor(estimators=[
    ('dtree', best_dtree),
    ('lregressor', lregressor),
    ('sdg', best_sdg),
    ('gb', best_gb)
])

# Training the VotingRegressor
start_time = time.time()
voting_regressor.fit(X_train, Y_train)
search_duration = time.time() - start_time

# Save the model
joblib.dump(voting_regressor, "best_model_predictor.pkl")


# Predict and evaluate
y_pred = voting_regressor.predict(X_test)

print(f"Grid Search Duration: {search_duration} seconds")

print(f'''\n {voting_regressor.__class__.__name__}: \n Mean Absolute error: {mean_absolute_error(y_pred, Y_test)} \n Mean Squared Error: {mean_squared_error(y_pred, Y_test)} \n R2_score: {r2_score(y_pred, Y_test)}''')

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 486 candidates, totalling 1458 fits
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Grid Search Duration: 102.22454833984375 seconds

 VotingRegressor: 
 Mean Absolute error: 1.2225702533220464 
 Mean Squared Error: 2.6029160216864766 
 R2_score: 0.9405331307332433


Testing on FIFA 2022 dataset