# Predicting high-potential FIFA players using individual performance data
Merete Lutz, Jake Barnabe, Simon Frew, Waleed Mahmood

DSCI 522, Group 17

## Introduction

## Methods

### Data

### Analysis

## Results & Discussion

In [1]:
import os 
import requests
import warnings
import zipfile

import numpy as np
import pandas as pd
import altair as alt

from hashlib import sha1
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import loguniform, randint, uniform
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings('ignore', category=FutureWarning)
alt.data_transformers.enable("vegafusion");
pd.set_option('display.max_rows', 200)

In [2]:
# download dataset
# method adapted from: https://github.com/ttimbers/breast_cancer_predictor_py/tree/0.0.1
url = "https://sports-statistics.com/database/fifa/fifa_2022_datasets.zip"

request = requests.get(url)

with open(os.path.join("data", "fifa_2022_datasets.zip"), "wb") as f: 
    f.write(request.content)
    
with zipfile.ZipFile(os.path.join("data", "fifa_2022_datasets.zip"), "r") as zip_file: 
    zip_file.extract("players_22.csv", path="data")
    
os.remove(os.path.join("data", "fifa_2022_datasets.zip"))

# data selection
df_raw = pd.read_csv("data/players_22.csv", encoding="utf-8", low_memory=False)
df_raw

Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,261962,https://sofifa.com/player/261962/defu-song/220002,Song Defu,宋德福,CDM,47,52,70000.0,1000.0,22,...,46+2,46+2,46+2,48+2,15+2,https://cdn.sofifa.net/players/261/962/22_120.png,https://cdn.sofifa.net/teams/112541/60.png,https://cdn.sofifa.net/flags/cn.png,,https://cdn.sofifa.net/flags/cn.png
19235,262040,https://sofifa.com/player/262040/caoimhin-port...,C. Porter,Caoimhin Porter,CM,47,59,110000.0,500.0,19,...,44+2,44+2,44+2,48+2,14+2,https://cdn.sofifa.net/players/262/040/22_120.png,https://cdn.sofifa.net/teams/445/60.png,https://cdn.sofifa.net/flags/ie.png,,https://cdn.sofifa.net/flags/ie.png
19236,262760,https://sofifa.com/player/262760/nathan-logue/...,N. Logue,Nathan Logue-Cunningham,CM,47,55,100000.0,500.0,21,...,45+2,45+2,45+2,47+2,12+2,https://cdn.sofifa.net/players/262/760/22_120.png,https://cdn.sofifa.net/teams/111131/60.png,https://cdn.sofifa.net/flags/ie.png,,https://cdn.sofifa.net/flags/ie.png
19237,262820,https://sofifa.com/player/262820/luke-rudden/2...,L. Rudden,Luke Rudden,ST,47,60,110000.0,500.0,19,...,26+2,26+2,26+2,32+2,15+2,https://cdn.sofifa.net/players/262/820/22_120.png,https://cdn.sofifa.net/teams/111131/60.png,https://cdn.sofifa.net/flags/ie.png,,https://cdn.sofifa.net/flags/ie.png


In [3]:
# Selecting columns for analysis
df_processed = df_raw.loc[:, ["potential",
                              "value_eur",
                              "wage_eur", 
                              "age",
                              "height_cm",
                              "weight_kg",
                              "weak_foot",
                              "skill_moves", 
                              "pace",
                              "shooting",
                              "passing",
                              "dribbling",
                              "defending",
                              "physic",]]

# Dropping observations with missing values
df_processed = df_processed.dropna()

In [4]:
# Binning the target class 'potential' into 4 categories
df_processed['potential'] = pd.cut(x=df_processed['potential'], bins=[0, 67, 71, 75, 100], 
                     labels=['Low', 'Medium', 'Good', 'Great'])
df_processed

In [6]:
# Create the split
fifa_train, fifa_test = train_test_split(df_processed, test_size=0.3, random_state=123)

fifa_train.to_csv("data/processed/fifa_train.csv")
fifa_test.to_csv("data/processed/fifa_test.csv")

In [7]:
passthrough_feats = ["potential"]
numeric_feats = ['value_eur', 'wage_eur', 'age', 'height_cm', 'weight_kg',
       'weak_foot', 'skill_moves', 'pace', 'shooting', 'passing', 'dribbling',
       'defending', 'physic']

fifa_preprocessor = make_column_transformer(
    ("passthrough", passthrough_feats),
    (StandardScaler(), numeric_feats), 
)

fifa_preprocessor.fit(fifa_train)
scaled_fifa_train = fifa_preprocessor.transform(fifa_train)
scaled_fifa_test = fifa_preprocessor.transform(fifa_test)

column_names = (passthrough_feats + numeric_feats)
scaled_fifa_train = pd.DataFrame(scaled_fifa_train, columns=column_names)
scaled_fifa_test = pd.DataFrame(scaled_fifa_train, columns=column_names)

scaled_fifa_train.to_csv("data/processed/scaled_fifa_train.csv")
scaled_fifa_test.to_csv("data/processed/scaled_fifa_test.csv")


In [8]:
stack_order = ['Low', 'Medium', 'Good', 'Great']

alt.Chart(scaled_fifa_train).mark_area(opacity=0.5).encode(
    alt.X(alt.repeat()).type('quantitative').bin(maxbins=20),
    alt.Y('count()', stack=None),
    alt.Color('potential').sort(stack_order).title("Potential")
).properties(
    width = 150,
    height = 150
).repeat(
    numeric_feats,
    columns = 4
)

In [9]:
X_train = scaled_fifa_train.drop(columns = ['potential'])
y_train = scaled_fifa_train['potential']
X_test = scaled_fifa_test.drop(columns = ['potential'])
y_test = scaled_fifa_test['potential']

In [11]:
# BELOW THIS IS JUST COPY AND PASTED IN FROM THE LAB 4
# Looking for the best model
models = {
    "dummy": DummyClassifier(random_state=123), 
    "Decision Tree": DecisionTreeClassifier(random_state=123),
    "KNN": KNeighborsClassifier(),
    "RBF SVM": SVC(random_state=123),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=2000, multi_class="ovr", random_state=123),
}

In [12]:
# From Leture 2 of 571 at the below link
# https://pages.github.ubc.ca/MDS-2023-24/DSCI_571_sup-learn-1_students/lectures/02_ml-fundamentals.html?highlight=mean_std_cross_val_scores
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation
    """
    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores[i], std_scores[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

In [14]:
results = pd.DataFrame()
for name, model in models.items():
    results[name] = mean_std_cross_val_scores(model, X_train, y_train, return_train_score=True)
results

Unnamed: 0,dummy,Decision Tree,KNN,RBF SVM,Naive Bayes,Logistic Regression
fit_time,0.003 (+/- 0.000),0.039 (+/- 0.001),0.008 (+/- 0.000),1.162 (+/- 0.008),0.007 (+/- 0.000),0.059 (+/- 0.003)
score_time,0.001 (+/- 0.000),0.002 (+/- 0.000),0.125 (+/- 0.001),0.524 (+/- 0.007),0.002 (+/- 0.000),0.002 (+/- 0.000)
test_score,0.276 (+/- 0.000),0.822 (+/- 0.005),0.567 (+/- 0.005),0.751 (+/- 0.009),0.570 (+/- 0.007),0.705 (+/- 0.005)
train_score,0.276 (+/- 0.000),1.000 (+/- 0.000),0.728 (+/- 0.004),0.786 (+/- 0.002),0.570 (+/- 0.002),0.709 (+/- 0.002)


In [17]:
param_dist = {
    "svc__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "svc__gamma": [0.001, 0.01, 0.1, 1, 10, 100]
}

pipe = make_pipeline(SVC(random_state=123))

random_search = RandomizedSearchCV(pipe,
                           param_dist,
                           n_iter=36,
                           n_jobs=-1,
                           return_train_score=True,
                           random_state=123)
                                   

random_search.fit(X_train, y_train)

pd.DataFrame(random_search.cv_results_)[
    [
        "mean_test_score",
        "mean_train_score",        
        "param_svc__C",
        "param_svc__gamma",
        "mean_fit_time",
        "rank_test_score",
    ]
].set_index("rank_test_score").sort_index().iloc[:5]

Unnamed: 0_level_0,mean_test_score,mean_train_score,param_svc__C,param_svc__gamma,mean_fit_time
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.811787,0.836351,100,0.01,1.563921
2,0.779678,0.948357,100,0.1,2.954938
3,0.778169,0.876069,10,0.1,1.655168
4,0.772132,0.785127,10,0.01,1.519807
5,0.752095,0.759369,100,0.001,1.729349


In [18]:
pd.DataFrame(results["RBF SVM"])

Unnamed: 0,RBF SVM
fit_time,1.162 (+/- 0.008)
score_time,0.524 (+/- 0.007)
test_score,0.751 (+/- 0.009)
train_score,0.786 (+/- 0.002)


In [19]:
best_model = random_search.best_estimator_
best_model

In [20]:
best_model.score(X_test, y_test)

0.8374413145539906

## References