# Predicting high-potential FIFA players using individual performance data
Merete Lutz, Jake Barnabe, Simon Frew, Waleed Mahmood

DSCI 522, Group 17

## Introduction

## Methods

### Data

### Analysis

## Results & Discussion

In [1]:
import warnings

# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)


import os 
import requests
import warnings
import zipfile

import numpy as np
import pandas as pd
import altair as alt


from hashlib import sha1
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Ignore every FutureWarning (the same category is already ignored via
# warnings.simplefilter at the top of this cell).
warnings.filterwarnings('ignore', category=FutureWarning)
# Enable Altair's "vegafusion" data transformer for the charts built below.
alt.data_transformers.enable("vegafusion");
# Raise pandas' display limit to 200 rows.
pd.set_option('display.max_rows', 200)

ModuleNotFoundError: No module named 'altair'

In [None]:
# download dataset
# method adapted from: https://github.com/ttimbers/breast_cancer_predictor_py/tree/0.0.1
url = "https://sports-statistics.com/database/fifa/fifa_2022_datasets.zip"
data_dir = "data"
zip_path = os.path.join(data_dir, "fifa_2022_datasets.zip")

# Create the target folder up front so a fresh checkout does not fail on open().
os.makedirs(data_dir, exist_ok=True)

# Fail fast on an HTTP error or a stalled connection instead of writing an
# error page to disk and crashing later while opening it as a zip archive.
request = requests.get(url, timeout=60)
request.raise_for_status()

with open(zip_path, "wb") as f:
    f.write(request.content)

# Only players_22.csv is extracted; the archive itself is deleted afterwards.
with zipfile.ZipFile(zip_path, "r") as zip_file:
    zip_file.extract("players_22.csv", path=data_dir)
os.remove(zip_path)

In [None]:
# NOTE(review): `df` is first assigned in the following cell, so on a fresh
# top-to-bottom run this cell presumably raises NameError — confirm and remove.
df

In [None]:
# Load the extracted player table into a DataFrame and display it.
players_csv_path = "data/players_22.csv"
df = pd.read_csv(players_csv_path, encoding="utf-8", low_memory=False)

df

In [None]:
# Columns kept for modelling: `potential` (used later to derive the target)
# followed by the player attributes. "player_positions" is currently excluded.
selected_columns = [
    "potential",
    "value_eur",
    "wage_eur",
    "age",
    "height_cm",
    "weight_kg",
    "weak_foot",
    "skill_moves",
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
]

# Restrict the frame to those columns, then discard rows with any missing value.
df = df.loc[:, selected_columns].dropna()

In [None]:
# Show pandas' summary statistics for the selected columns.
df.describe()

In [None]:
# Discretise `potential` into four labelled classes. With pd.cut's default
# right-closed intervals the bins are (0, 67], (67, 71], (71, 75], (75, 100].
potential_bin_edges = [0, 67, 71, 75, 100]
potential_labels = ['Low', 'Medium', 'Good', 'Great']
df['target'] = pd.cut(
    x=df['potential'],
    bins=potential_bin_edges,
    labels=potential_labels,
)

In [None]:
# Display the frame, which now includes the derived `target` column.
df

In [None]:
# `target` has been derived from `potential`, so the raw column is dropped.
df = df.drop(columns=['potential'])

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle so the
# split is reproducible between runs.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=123)
# Display the training portion.
train_df

In [None]:
# Names of the int64/float64 columns whose distributions are plotted.
# NOTE(review): the charts below are built from the full `df`, not `train_df` —
# confirm that including the held-out rows in this exploration is intended.
df_numeric = df.select_dtypes(include=['int64', 'float64'])
df_numeric_list = df_numeric.columns
print(df_numeric_list)

# One small histogram (at most 20 bins) per numeric column, collected in
# `charts_numeric`.
charts_numeric = [
    alt.Chart(df).mark_bar().encode(
        alt.X(column, bin=alt.Bin(maxbins=20), title=column),
        alt.Y('count()', title='Frequency'),
    ).properties(
        width=150,
        height=100,
        title=f'Distribution of {column}',
    )
    for column in df_numeric_list
]

# Lay the charts out four per row. Deriving the rows from the actual number of
# charts avoids empty rows when there are few columns and avoids silently
# dropping any chart beyond the sixteenth.
charts_per_row = 4
chart_rows = [
    alt.hconcat(*charts_numeric[start:start + charts_per_row])
    for start in range(0, len(charts_numeric), charts_per_row)
]

alt.vconcat(*chart_rows).configure_title(fontSize=10).configure_axis(titleFontSize=8)

In [None]:
# Separate the feature columns from the `target` label for both splits.
X_train, y_train = train_df.drop(columns=['target']), train_df['target']
X_test, y_test = test_df.drop(columns=['target']), test_df['target']

In [None]:
# NOTE: everything below this cell was copied and pasted from Lab 4.

In [None]:
# Candidate classifiers to compare. Each key is the label that becomes a column
# name in the cross-validation results table; wherever a `random_state` is
# passed it is 123.
models = {
    "dummy": DummyClassifier(random_state=123), 
    "Decision Tree": DecisionTreeClassifier(random_state=123),
    "KNN": KNeighborsClassifier(),
    "RBF SVM": SVC(random_state=123),
    "Naive Bayes": GaussianNB(),
    # NOTE(review): `multi_class` is reportedly deprecated in recent
    # scikit-learn releases — confirm against the installed version.
    "Logistic Regression": LogisticRegression(max_iter=2000, multi_class="ovr", random_state=123),
}

In [None]:
# From Lecture 2 of 571 at the below link
# https://pages.github.ubc.ca/MDS-2023-24/DSCI_571_sup-learn-1_students/lectures/02_ml-fundamentals.html?highlight=mean_std_cross_val_scores
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Return the mean and standard deviation of cross-validation results.

    Parameters
    ----------
    model :
        Estimator or pipeline passed to ``cross_validate``.
    X_train :
        Training features passed to ``cross_validate``.
    y_train :
        Training labels passed to ``cross_validate``.
    **kwargs :
        Forwarded unchanged to ``cross_validate``
        (for example ``return_train_score=True``).

    Returns
    -------
    pandas.Series
        One entry per key returned by ``cross_validate``, each formatted as
        ``"mean (+/- std)"`` with three decimal places.
    """
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))

    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values by iteration rather than by integer key: both Series are
    # labelled with strings, and integer lookups on such a Series rely on a
    # deprecated positional fallback.
    out_col = [
        f"{mean:.3f} (+/- {std:.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
# Cross-validate every candidate behind a StandardScaler and collect one
# column of formatted "mean (+/- std)" scores per model.
results = pd.DataFrame(
    {
        name: mean_std_cross_val_scores(
            make_pipeline(StandardScaler(), model),
            X_train,
            y_train,
            return_train_score=True,
        )
        for name, model in models.items()
    }
)

In [None]:
# Display the cross-validation summary, one column per model.
results

In [None]:
from scipy.stats import loguniform, randint, uniform
from sklearn.model_selection import RandomizedSearchCV

# Both SVC hyperparameters are searched over the same powers of ten. The
# "svc__" prefix addresses the SVC step of the pipeline built below.
powers_of_ten = [0.001, 0.01, 0.1, 1, 10, 100]
param_dist = {
    "svc__C": list(powers_of_ten),
    "svc__gamma": list(powers_of_ten),
}

pipe = make_pipeline(StandardScaler(), SVC(random_state=123))

# Try 10 sampled (C, gamma) combinations on all available cores, keeping the
# training scores so they can be compared with the validation scores.
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=10,
    n_jobs=-1,
    return_train_score=True,
    random_state=123,
)
random_search.fit(X_train, y_train)

# Show the five best-ranked candidates with their scores, parameters and fit time.
report_columns = [
    "mean_test_score",
    "mean_train_score",
    "param_svc__C",
    "param_svc__gamma",
    "mean_fit_time",
    "rank_test_score",
]
cv_report = pd.DataFrame(random_search.cv_results_)[report_columns]
cv_report.set_index("rank_test_score").sort_index().iloc[:5]

In [None]:
# NOTE(review): `results` has not been reassigned since it was last displayed,
# so this repeats the earlier cross-validation table — confirm it is needed.
results

In [None]:
# Take the best pipeline found by the randomized search and display it.
best_model = random_search.best_estimator_
best_model

In [None]:
# Score the tuned pipeline on the held-out test split.
best_model.score(X_test, y_test)

## References