In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
# Load the FIFA player dataset from an Excel workbook into `fifa`.
# NOTE: the path below is absolute (it looks like a Colab sample_data
# location -- confirm); change it to wherever FIFA_Dataset.xlsx lives
# before running the notebook.
fifa = pd.read_excel(r"/content/sample_data/FIFA_Dataset.xlsx")

In [None]:
# Remove the columns that are not used by the models below.  The frame is
# modified in place, so running this cell a second time raises KeyError
# because the columns are already gone.
fifa.drop(
    columns=[
        'player_id', 'player_positions', 'goalkeeping_speed', 'short_name', 'dob',
        'league_name', 'league_id', 'league_level',
        'club_team_id', 'club_name', 'club_position', 'club_jersey_number',
        'nationality_id', 'nationality_name',
        'nation_team_id', 'nation_position', 'nation_jersey_number',
        'preferred_foot', 'work_rate', 'body_type',
        'player_tags', 'player_traits',
    ],
    inplace=True,
)

In [None]:
# Per-column count of missing values, keeping only columns that have any.
fifa.isnull().sum().loc[lambda counts: counts > 0]

pace         141
shooting     141
passing      141
dribbling    141
defending    141
physic       141
dtype: int64

In [None]:
# Replace every missing value in the frame with 0 (in place).
fifa.fillna(0, inplace=True)

In [None]:
# Sanity check: total number of missing values left in the whole frame.
fifa.isna().sum().sum()

0

In [None]:
# List the columns that are still stored with the generic object dtype.
fifa.select_dtypes(include='object').columns

Index(['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
       'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb',
       'lcb', 'cb', 'rcb', 'rb', 'gk'],
      dtype='object')

In [None]:
# Preview the raw position-rating columns before they are converted.
fifa.loc[:, [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]].head()

Unnamed: 0,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,83+3,83+3,83+3,87,87,87,87,87,89+2,89+2,...,79+3,79+3,79+3,78+3,74+3,68+3,68+3,68+3,74+3,21+3
1,86+3,86+3,86+3,88,88,88,88,88,88+2,88+2,...,68+3,68+3,68+3,71+3,67+3,58+3,58+3,58+3,67+3,22+3
2,88+1,88+1,88+1,85,87,87,87,85,84+3,84+3,...,58+3,58+3,58+3,61+3,57+3,52+3,52+3,52+3,57+3,20+3
3,76+3,76+3,76+3,73,76,76,76,73,76+3,76+3,...,86+3,86+3,86+3,81+3,82+3,87+2,87+2,87+2,82+3,21+3
4,87+2,87+2,87+2,84,86,86,86,84,85+3,85+3,...,68+3,68+3,68+3,67+3,63+3,61+3,61+3,61+3,63+3,20+3


In [None]:
def convertSkills(value):
    """Convert a position-rating cell into a single integer rating.

    String cells have the form ``"<base><sign><modifier>"`` (for example
    ``"83+3"``) or just ``"<base>"``; the signed modifier is added to the
    base.  Non-string cells are returned unchanged.

    Parameters
    ----------
    value : object
        Raw cell value from one of the position-rating columns.

    Returns
    -------
    int or object
        Base plus signed modifier for string input; ``value`` itself for
        any non-string input.

    Raises
    ------
    ValueError
        If a string cell cannot be parsed as integers.
    """
    if not isinstance(value, str):
        return value

    text = value.strip()
    # Find the sign separating base and modifier instead of slicing fixed
    # positions: fixed slicing mis-parses "87" (no modifier), "65-2"
    # (negative modifier) and any base/modifier that is not exactly two /
    # one digits wide.
    sign_pos = max(text.rfind('+'), text.rfind('-'))
    if sign_pos <= 0:
        # No modifier present (or only a leading sign): plain integer.
        return int(text)
    # int() accepts a leading sign, so "+3" -> 3 and "-2" -> -2.
    return int(text[:sign_pos]) + int(text[sign_pos:])

In [None]:
# Names of the per-position rating columns that need converting.
col_skill = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]
# Run convertSkills over every cell of each position column, overwriting
# the column with the converted values.
for col in col_skill:
    fifa[col] = fifa[col].map(convertSkills)

In [None]:
# Preview the same position-rating columns after conversion (col_skill is
# the column list defined in the conversion cell).
fifa[col_skill].head()

Unnamed: 0,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,86,86,86,87,87,87,87,87,91,91,...,82,82,82,81,77,71,71,71,77,24
1,89,89,89,88,88,88,88,88,90,90,...,71,71,71,74,70,61,61,61,70,25
2,89,89,89,85,87,87,87,85,87,87,...,61,61,61,64,60,55,55,55,60,23
3,79,79,79,73,76,76,76,73,79,79,...,89,89,89,84,85,89,89,89,85,24
4,89,89,89,84,86,86,86,84,88,88,...,71,71,71,70,66,64,64,64,66,23


In [None]:
def age_numeric(val):
    """Map an age to an ordinal bucket code.

    Codes: 0 for 16-19, 1 for 20-25, 2 for 26-30 (all bounds inclusive).
    Any other value -- above 30, below 16, or falling between two
    buckets -- maps to 3.
    """
    for low, high, code in ((16, 19, 0), (20, 25, 1), (26, 30, 2)):
        if low <= val <= high:
            return code
    return 3
# Replace the raw age with its bucket code (0-3), overwriting the column.
# WARNING: not idempotent -- age_numeric returns 3 for anything outside
# 16-30, so running this cell a second time turns every code (0-3) into 3.
fifa['age'] = fifa['age'].apply(age_numeric)

In [None]:
# Build the age-classification dataset from a NaN-free copy of `fifa`:
# the bucketed `age` column is the target, everything else (one-hot
# encoded where needed) is the feature matrix.
df_age = fifa.fillna(0)
y_Age = df_age['age']
X_Age = pd.get_dummies(df_age.drop(columns=['age']))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.  Later age-model cells reuse these four variables.
X_train, X_test, y_train, y_test = train_test_split(X_Age, y_Age, test_size=0.2, random_state=42)

# Running sums of the per-category metrics, averaged after the loop.
age_acc = 0
age_recall = 0
age_support = 0
age_f1 = 0
print("Task 1: Age")
for category in range(4):
    # One-vs-rest targets: 1 where the row belongs to this age bucket.
    y_train_bin = (y_train == category).astype(int)
    y_test_bin = (y_test == category).astype(int)

    # Fitting on the unscaled features stopped at the solver's iteration
    # limit (ConvergenceWarning), so standardise the features first and
    # give the solver a larger iteration budget.
    mod = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    mod.fit(X_train, y_train_bin)

    predictions = mod.predict(X_test)

    report = classification_report(y_test_bin, predictions, output_dict=True)
    print(f"Results for category {category} using Logistic Regression for age:")
    print(classification_report(y_test_bin, predictions))
    print("="*50)
    print('\n')

    age_acc += report['accuracy']
    age_recall += report['macro avg']['recall']
    age_support += report['macro avg']['support']
    age_f1 += report['macro avg']['f1-score']

# Average the four one-vs-rest results.
age_acc /= 4
age_recall /= 4
age_support /= 4
age_f1 /= 4

print("Overall Metrics for Age using Logistic Regression:")
print(f"Overall Accuracy: {age_acc:.2%}")
print(f"Overall Recall: {age_recall:.2%}")
print(f"Overall Support: {age_support:.2f}")
print(f"Overall F1-score: {age_f1:.2%}")



Task 1: Age
Results for category 0 using Logistic Regression for age:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       207
           1       0.76      0.84      0.80        45

    accuracy                           0.92       252
   macro avg       0.86      0.89      0.88       252
weighted avg       0.93      0.92      0.93       252



Results for category 1 using Logistic Regression for age:
              precision    recall  f1-score   support

           0       0.66      0.84      0.74       148
           1       0.62      0.38      0.48       104

    accuracy                           0.65       252
   macro avg       0.64      0.61      0.61       252
weighted avg       0.65      0.65      0.63       252





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Results for category 2 using Logistic Regression for age:
              precision    recall  f1-score   support

           0       0.78      0.89      0.83       185
           1       0.49      0.30      0.37        67

    accuracy                           0.73       252
   macro avg       0.63      0.59      0.60       252
weighted avg       0.70      0.73      0.71       252



Results for category 3 using Logistic Regression for age:
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       216
           1       0.79      0.42      0.55        36

    accuracy                           0.90       252
   macro avg       0.85      0.70      0.74       252
weighted avg       0.89      0.90      0.89       252



Overall Metrics for Age using Logistic Regression:
Overall Accuracy: 80.16%
Overall Recall: 69.90%
Overall Support: 252.00
Overall F1-score: 70.70%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:

# One-vs-rest gradient boosting for the age buckets.  Uses the
# X_train / X_test / y_train / y_test produced by the age split cell.
cat_report = []        # per-category report as a dict (for averaging)
cat_report_text = []   # per-category report as printable text
accur_score = []       # per-category accuracy

for category in range(4):
    # One-vs-rest targets: 1 where the row belongs to this age bucket.
    y_train_bin = (y_train == category).astype(int)
    y_test_bin = (y_test == category).astype(int)

    # Fixed seed so repeated runs give the same model.
    gb_mod = GradientBoostingClassifier(random_state=42)
    gb_mod.fit(X_train, y_train_bin)

    predictions = gb_mod.predict(X_test)

    report = classification_report(y_test_bin, predictions, output_dict=True)
    cat_report.append(report)
    # Capture the text report now, while `predictions` still belongs to
    # this category; printing later from the loop leftovers would show the
    # last category's report four times.
    cat_report_text.append(classification_report(y_test_bin, predictions))

    accuracy = accuracy_score(y_test_bin, predictions)
    accur_score.append(accuracy)

for cat, report_text in enumerate(cat_report_text):
    print(f"Results for category {cat} for age using Gradient Boosting:")
    print(report_text)
    print("="*50)
    print('\n')

# Average the four one-vs-rest results.
age_accur = sum(accur_score) / len(accur_score)
age_recall = sum(report['macro avg']['recall'] for report in cat_report) / len(cat_report)
age_support = sum(report['macro avg']['support'] for report in cat_report) / len(cat_report)
age_f1 = sum(report['macro avg']['f1-score'] for report in cat_report) / len(cat_report)

print("Overall Metrics for Age using Gradient Boosting:")
print(f"Overall Accuracy: {age_accur * 100:.2f}%")
print(f"Overall Recall: {age_recall:.2%}")
print(f"Overall Support: {age_support:.2f}")
print(f"Overall F1-score: {age_f1:.2%}")

Results for category 0 for age using Gradient Boosting:
              precision    recall  f1-score   support

           0       0.87      0.96      0.92       216
           1       0.43      0.17      0.24        36

    accuracy                           0.85       252
   macro avg       0.65      0.56      0.58       252
weighted avg       0.81      0.85      0.82       252



Results for category 1 for age using Gradient Boosting:
              precision    recall  f1-score   support

           0       0.87      0.96      0.92       216
           1       0.43      0.17      0.24        36

    accuracy                           0.85       252
   macro avg       0.65      0.56      0.58       252
weighted avg       0.81      0.85      0.82       252



Results for category 2 for age using Gradient Boosting:
              precision    recall  f1-score   support

           0       0.87      0.96      0.92       216
           1       0.43      0.17      0.24        36

    accura

In [None]:
# One-vs-rest XGBoost for the age buckets.  Uses the
# X_train / X_test / y_train / y_test produced by the age split cell.
cat_report_xgb = []        # per-category report as a dict (for averaging)
cat_report_text_xgb = []   # per-category report as printable text
accur_score_xgb = []       # per-category accuracy

for category in range(4):
    # One-vs-rest targets: 1 where the row belongs to this age bucket.
    y_train_bin = (y_train == category).astype(int)
    y_test_bin = (y_test == category).astype(int)

    xgb_mod = XGBClassifier()
    xgb_mod.fit(X_train, y_train_bin)
    predictions_xgb = xgb_mod.predict(X_test)

    report_xgb = classification_report(y_test_bin, predictions_xgb, output_dict=True)
    cat_report_xgb.append(report_xgb)
    # Capture the text report while `predictions_xgb` still belongs to this
    # category; scoring every category against the last category's
    # predictions afterwards produces meaningless reports.
    cat_report_text_xgb.append(classification_report(y_test_bin, predictions_xgb))

    accuracy_xgb = accuracy_score(y_test_bin, predictions_xgb)
    accur_score_xgb.append(accuracy_xgb)

for cat, report_text_xgb in enumerate(cat_report_text_xgb):
    print(f"Results for category {cat} using XGBoost for age:")
    print(report_text_xgb)
    print("="*50)
    print('\n')

# Average the four one-vs-rest results.
age_accur_xgb = sum(accur_score_xgb) / len(accur_score_xgb)
age_recall_xgb = sum(report_xgb['macro avg']['recall'] for report_xgb in cat_report_xgb) / len(cat_report_xgb)
age_support_xgb = sum(report_xgb['macro avg']['support'] for report_xgb in cat_report_xgb) / len(cat_report_xgb)
age_f1_xgb = sum(report_xgb['macro avg']['f1-score'] for report_xgb in cat_report_xgb) / len(cat_report_xgb)

print("Overall Metrics for Age using XGBoost:")
print(f"Overall Accuracy: {age_accur_xgb * 100:.2f}%")
print(f"Overall Recall: {age_recall_xgb:.2%}")
print(f"Overall Support: {age_support_xgb:.2f}")
print(f"Overall F1-score: {age_f1_xgb:.2%}")


Results for category 0 using XGBoost for age:
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       207
           1       0.00      0.00      0.00        45

    accuracy                           0.74       252
   macro avg       0.40      0.45      0.43       252
weighted avg       0.66      0.74      0.70       252



Results for category 1 using XGBoost for age:
              precision    recall  f1-score   support

           0       0.56      0.87      0.68       148
           1       0.05      0.01      0.02       104

    accuracy                           0.52       252
   macro avg       0.30      0.44      0.35       252
weighted avg       0.35      0.52      0.41       252



Results for category 2 using XGBoost for age:
              precision    recall  f1-score   support

           0       0.75      0.94      0.83       185
           1       0.40      0.12      0.18        67

    accuracy                           0

In [None]:
def overall_numeric(val):
    """Map an overall rating to an ordinal bucket code.

    Codes: 0 for 91-100, 1 for 81-90, 2 for 71-80 (all bounds inclusive).
    Any other value -- 70 and below, above 100, or falling between two
    buckets -- maps to 3.
    """
    for low, high, code in ((91, 100, 0), (81, 90, 1), (71, 80, 2)):
        if low <= val <= high:
            return code
    return 3
# Replace the raw overall rating with its bucket code (0-3), overwriting
# the column.
# WARNING: not idempotent -- overall_numeric returns 3 for anything outside
# 71-100, so running this cell a second time turns every code (0-3) into 3.
fifa['overall'] = fifa['overall'].apply(overall_numeric)

In [None]:
# Build the overall-rating classification dataset from a NaN-free copy of
# `fifa`: the bucketed `overall` column is the target, everything else
# (one-hot encoded where needed) is the feature matrix.
df_overall = fifa.fillna(0)
y_Overall = df_overall['overall']
X_Overall = pd.get_dummies(df_overall.drop(columns=['overall']))
# Absolute correlation of every column with the target, strongest first.
df_overall.corr()['overall'].abs().sort_values(ascending=False)

overall                     1.000000
movement_reactions          0.830904
wage_eur                    0.806546
mentality_composure         0.683750
international_reputation    0.629784
                              ...   
goalkeeping_reflexes        0.038501
goalkeeping_kicking         0.036078
goalkeeping_diving          0.035892
goalkeeping_positioning     0.034675
goalkeeping_handling        0.029409
Name: overall, Length: 76, dtype: float64

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.  Later overall-model cells reuse these four variables.
X_train, X_test, y_train, y_test = train_test_split(X_Overall, y_Overall, test_size=0.2, random_state=42)

# Running sums of the per-category metrics, averaged after the loop.
over_acc = 0
over_recall = 0
over_support = 0
over_f1 = 0
print("Task 1: Overall")
for category in range(4):
    # One-vs-rest targets: 1 where the row belongs to this rating bucket.
    y_train_bin = (y_train == category).astype(int)
    y_test_bin = (y_test == category).astype(int)

    # Fitting on the unscaled features stopped at the solver's iteration
    # limit (ConvergenceWarning), so standardise the features first and
    # give the solver a larger iteration budget.
    mod = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
    mod.fit(X_train, y_train_bin)

    predictions = mod.predict(X_test)

    report = classification_report(y_test_bin, predictions, output_dict=True)
    print(f"Results for category {category} using Logistic Regression for Overall:")
    print(classification_report(y_test_bin, predictions))
    print("="*50)
    print('\n')

    over_acc += report['accuracy']
    over_recall += report['macro avg']['recall']
    over_support += report['macro avg']['support']
    over_f1 += report['macro avg']['f1-score']

# Average the four one-vs-rest results.
over_acc /= 4
over_recall /= 4
over_support /= 4
over_f1 /= 4

print("Overall Metrics for Overall using Logistic Regression:")
print(f"Overall Accuracy: {over_acc:.2%}")
print(f"Overall Recall: {over_recall:.2%}")
print(f"Overall Support: {over_support:.2f}")
print(f"Overall F1-score: {over_f1:.2%}")



Task 1: Overall
Results for category 0 using Logistic Regression for Overall:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       252

    accuracy                           1.00       252
   macro avg       1.00      1.00      1.00       252
weighted avg       1.00      1.00      1.00       252



Results for category 1 using Logistic Regression for Overall:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       228
           1       0.83      0.79      0.81        24

    accuracy                           0.96       252
   macro avg       0.90      0.89      0.89       252
weighted avg       0.96      0.96      0.96       252



Results for category 2 using Logistic Regression for Overall:
              precision    recall  f1-score   support

           0       0.87      0.75      0.81       138
           1       0.74      0.86      0.80       114

    accuracy                  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:

# One-vs-rest gradient boosting for the overall-rating buckets.  Uses the
# X_train / X_test / y_train / y_test produced by the overall split cell.
cat_report = []        # per-category report as a dict (for averaging)
cat_report_text = []   # per-category report as printable text
accur_score = []       # per-category accuracy

for category in range(4):
    # One-vs-rest targets: 1 where the row belongs to this rating bucket.
    y_train_bin = (y_train == category).astype(int)
    y_test_bin = (y_test == category).astype(int)

    # Fixed seed so repeated runs give the same model.
    gb_mod = GradientBoostingClassifier(random_state=42)
    gb_mod.fit(X_train, y_train_bin)

    predictions = gb_mod.predict(X_test)

    report = classification_report(y_test_bin, predictions, output_dict=True)
    cat_report.append(report)
    # Capture the text report now, while `predictions` still belongs to
    # this category; printing later from the loop leftovers would show the
    # last category's report four times.
    cat_report_text.append(classification_report(y_test_bin, predictions))

    accuracy = accuracy_score(y_test_bin, predictions)
    accur_score.append(accuracy)

for cat, report_text in enumerate(cat_report_text):
    print(f"Results for category {cat} for Overall using Gradient Boosting:")
    print(report_text)
    print("="*50)
    print('\n')

# Average the four one-vs-rest results.
over_accur = sum(accur_score) / len(accur_score)
over_recall = sum(report['macro avg']['recall'] for report in cat_report) / len(cat_report)
over_support = sum(report['macro avg']['support'] for report in cat_report) / len(cat_report)
over_f1 = sum(report['macro avg']['f1-score'] for report in cat_report) / len(cat_report)

print("Overall Metrics for Overall using Gradient Boosting:")
print(f"Overall Accuracy: {over_accur * 100:.2f}%")
print(f"Overall Recall: {over_recall:.2%}")
print(f"Overall Support: {over_support:.2f}")
print(f"Overall F1-score: {over_f1:.2%}")

Results for category 0 for Overall using Gradient Boosting:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       138
           1       0.98      0.98      0.98       114

    accuracy                           0.98       252
   macro avg       0.98      0.98      0.98       252
weighted avg       0.98      0.98      0.98       252



Results for category 1 for Overall using Gradient Boosting:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       138
           1       0.98      0.98      0.98       114

    accuracy                           0.98       252
   macro avg       0.98      0.98      0.98       252
weighted avg       0.98      0.98      0.98       252



Results for category 2 for Overall using Gradient Boosting:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       138
           1       0.98      0.98      0.98       114

In [None]:
# One-vs-rest XGBoost for the overall-rating buckets.  Uses the
# X_train / X_test / y_train / y_test produced by the overall split cell.
cat_report_xgb = []        # per-category report as a dict (for averaging)
cat_report_text_xgb = []   # per-category report as printable text
accur_score_xgb = []       # per-category accuracy

for category in range(4):
    # One-vs-rest targets: 1 where the row belongs to this rating bucket.
    y_train_bin = (y_train == category).astype(int)
    y_test_bin = (y_test == category).astype(int)

    xgb_mod = XGBClassifier()
    xgb_mod.fit(X_train, y_train_bin)
    predictions_xgb = xgb_mod.predict(X_test)

    report_xgb = classification_report(y_test_bin, predictions_xgb, output_dict=True)
    cat_report_xgb.append(report_xgb)
    # Capture the text report while `predictions_xgb` still belongs to this
    # category; scoring every category against the last category's
    # predictions afterwards produces meaningless reports.
    cat_report_text_xgb.append(classification_report(y_test_bin, predictions_xgb))

    accuracy_xgb = accuracy_score(y_test_bin, predictions_xgb)
    accur_score_xgb.append(accuracy_xgb)

for cat, report_text_xgb in enumerate(cat_report_text_xgb):
    print(f"Results for category {cat} using XGBoost for Overall:")
    print(report_text_xgb)
    print("="*50)
    print('\n')

# Average the four one-vs-rest results.
overall_accur_xgb = sum(accur_score_xgb) / len(accur_score_xgb)
overall_recall_xgb = sum(report_xgb['macro avg']['recall'] for report_xgb in cat_report_xgb) / len(cat_report_xgb)
overall_support_xgb = sum(report_xgb['macro avg']['support'] for report_xgb in cat_report_xgb) / len(cat_report_xgb)
overall_f1_xgb = sum(report_xgb['macro avg']['f1-score'] for report_xgb in cat_report_xgb) / len(cat_report_xgb)

print("Overall Metrics for Overall using XGBoost:")
print(f"Overall Accuracy: {overall_accur_xgb * 100:.2f}%")
print(f"Overall Recall: {overall_recall_xgb:.2%}")
print(f"Overall Support: {overall_support_xgb:.2f}")
print(f"Overall F1-score: {overall_f1_xgb:.2%}")


Results for category 0 using XGBoost for Overall:
              precision    recall  f1-score   support

           0       1.00      0.56      0.72       252
           1       0.00      0.00      0.00         0

    accuracy                           0.56       252
   macro avg       0.50      0.28      0.36       252
weighted avg       1.00      0.56      0.72       252



Results for category 1 using XGBoost for Overall:
              precision    recall  f1-score   support

           0       0.83      0.51      0.63       228
           1       0.00      0.00      0.00        24

    accuracy                           0.46       252
   macro avg       0.41      0.26      0.32       252
weighted avg       0.75      0.46      0.57       252



Results for category 2 using XGBoost for Overall:
              precision    recall  f1-score   support

           0       0.21      0.21      0.21       138
           1       0.02      0.02      0.02       114

    accuracy                

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
