In [1]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import math
import dtale
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
df = pd.read_csv('final_data.csv')

In [3]:
dtale.show(df, open_browser=True)



In [4]:
df.shape

(10754, 22)

In [5]:
df.head(2)

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,...,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,...,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0


In [6]:
# uniqeness
df.duplicated().sum()

0

In [7]:
# Completeness
df.isnull().sum()

player                 0
team                   0
name                   0
position               0
height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
position_encoded       0
winger                 0
dtype: int64

In [8]:
# Accurecy types
df.dtypes

player                  object
team                    object
name                    object
position                object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
position_encoded         int64
winger                   int64
dtype: object

In [9]:
# drop column player we dont need it 
# drop column position becuse we have position_encoded
df.drop('player',axis=1,inplace = True) 
df.drop('position',axis=1,inplace = True)
df.drop('name',axis=1,inplace= True) 
df.dtypes

team                    object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
position_encoded         int64
winger                   int64
dtype: object

In [10]:
# Accurecy - Outlier

# Convert categorical variables using OneHotEncoding
# position is alrady encoded

categorical_features = ['team']
numeric_features = ['height', 'age', 'appearance','goals', 'assists', 'yellow cards', 'second yellow cards', 'red cards',
       'goals conceded', 'clean sheets', 'minutes played', 'days_injured',
       'games_injured', 'award', 'current_value', 'highest_value',
       'position_encoded', 'winger']

In [11]:
for i in categorical_features:
    print (df[i].value_counts())

team
Daejeon Hana Citizen      46
Jeonbuk Hyundai Motors    46
FC Seoul                  45
Gangwon FC                43
Daegu FC                  42
                          ..
FC Barcelona              22
Atlético de Madrid        22
CA Osasuna                22
Hatayspor                 12
Gaziantep FK               6
Name: count, Length: 374, dtype: int64


In [12]:
for i in numeric_features:
    print(df[i].value_counts())

height
180.000000    744
185.000000    608
175.000000    596
178.000000    594
183.000000    572
188.000000    493
184.000000    458
182.000000    449
181.240353    440
186.000000    423
187.000000    419
176.000000    390
177.000000    372
190.000000    371
173.000000    366
179.000000    359
181.000000    355
170.000000    312
191.000000    272
174.000000    270
189.000000    255
172.000000    254
192.000000    203
193.000000    181
171.000000    169
168.000000    124
194.000000    117
169.000000    107
195.000000     98
167.000000     85
196.000000     68
166.000000     44
165.000000     42
197.000000     31
198.000000     24
163.000000     16
160.000000     15
164.000000     14
199.000000     12
200.000000     12
202.000000      6
162.000000      6
161.000000      3
206.000000      2
204.000000      1
156.000000      1
159.000000      1
Name: count, dtype: int64
age
22.000000    841
23.000000    823
26.000000    812
25.000000    772
24.000000    767
21.000000    715
28.000000    67

In [13]:
df = df[df['age']<40]
df = df[df["appearance"]<100]
df = df[df["goals"]>0.5]
df = df[df['award']<5]
df = df[df["days_injured"]<407]
df = df[df['games_injured']<60]
df = df[df["minutes played"]<8000]

In [14]:
df.head(2)

Unnamed: 0,team,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
26,Manchester United,180.0,18.0,72,0.52573,0.300417,0.225313,0.025035,0.0,0.0,0.0,3595,50,11,2,25000000,25000000,4,1
161,Southampton FC,183.0,20.0,58,0.542714,0.226131,0.180905,0.0,0.0,0.0,0.0,1990,0,0,0,12000000,12000000,4,0


In [15]:
# one hot coding
df = pd.get_dummies(df, columns=categorical_features)

In [16]:
print(df.columns.tolist()) 

['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards', 'second yellow cards', 'red cards', 'goals conceded', 'clean sheets', 'minutes played', 'days_injured', 'games_injured', 'award', 'current_value', 'highest_value', 'position_encoded', 'winger', 'team_1.FC Köln', 'team_1.FC Union Berlin', 'team_1.FSV Mainz 05', 'team_AA Argentinos Juniors', 'team_AC Ajaccio', 'team_ACF Fiorentina', 'team_AJ Auxerre', 'team_AS Monaco', 'team_AZ Alkmaar', 'team_Adana Demirspor', 'team_Adelaide United', 'team_Ajax Amsterdam', 'team_Akhmat Grozny', 'team_Al-Adalah FC', 'team_Al-Fateh SC', 'team_Al-Hilal Saudi FC', 'team_Al-Ittihad Club (Jeddah)', 'team_Al-Nassr FC', 'team_Al-Raed SFC', 'team_Al-Shabab Club (Riyadh)', 'team_Al-Taawoun FC', 'team_Al-Tai', 'team_Al-Wehda FC', 'team_Alanyaspor', 'team_Albirex Niigata', 'team_Angers SCO', 'team_Antalyaspor', 'team_Arsenal FC', 'team_Atalanta BC', 'team_Atlanta United FC', 'team_Atlético de San Luis', 'team_Austin FC', 'team_Avispa Fukuoka', 'tea

In [17]:
# Calculate the 35th and 75th percentiles of the current_value
p35 = df['current_value'].quantile(0.35)
p75 = df['current_value'].quantile(0.75)

# Function to categorize current_values
def categorize_current_value(current_value):
    if current_value < p35:
        return 'cv_low'
    elif current_value < p75:
        return 'cv_good'
    else:
        return 'cv_high'

# Apply the function to create a new column
df['players_current_value_category'] = df['current_value'].apply(categorize_current_value)

df.drop('current_value', axis=1, inplace=True)

#Verify the distribution of the new categories
print(df['players_current_value_category'].value_counts())

players_current_value_category
cv_good    147
cv_low     123
cv_high     94
Name: count, dtype: int64


In [18]:
p35 ,p75

(400000.0, 3500000.0)

In [19]:
df['players_current_value_category'] 

26       cv_high
161      cv_high
184      cv_high
186      cv_high
242      cv_high
          ...   
10667     cv_low
10669     cv_low
10720     cv_low
10725     cv_low
10743     cv_low
Name: players_current_value_category, Length: 364, dtype: object

In [20]:
encoder = LabelEncoder()
#pcvc_encoded == players current value category encoded
df['pcvc_encoded'] = encoder.fit_transform(df['players_current_value_category'])  

In [21]:
df.drop('players_current_value_category',axis=1,inplace=True)

In [22]:
corrl = df.corr(numeric_only=True)
#print(corrl['current_value'].sort_values(ascending=False))

print(corrl['pcvc_encoded'].sort_values(ascending=False))

pcvc_encoded                   1.000000
team_Jeonbuk Hyundai Motors    0.130844
goals                          0.130619
team_Cerezo Osaka              0.127226
team_Gangwon FC                0.113157
                                 ...   
minutes played                -0.202431
appearance                    -0.221586
age                           -0.315197
goals conceded                      NaN
clean sheets                        NaN
Name: pcvc_encoded, Length: 250, dtype: float64


In [23]:
# Set the correlation threshold
threshold = 0.1

# Filter the correlations
# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = corrl[abs(corrl['pcvc_encoded']) > threshold]['pcvc_encoded'].index
selected_features

Index(['age', 'appearance', 'goals', 'yellow cards', 'minutes played',
       'days_injured', 'games_injured', 'award', 'position_encoded',
       'team_Cerezo Osaka', 'team_Gangwon FC', 'team_Jeonbuk Hyundai Motors',
       'team_Kashiwa Reysol', 'team_Melbourne City FC', 'pcvc_encoded'],
      dtype='object')

In [24]:
selected_features = ['age', 'appearance', 'goals', 'yellow cards', 'minutes played',
       'days_injured', 'games_injured', 'award', 'position_encoded',
       'team_Cerezo Osaka', 'team_Gangwon FC', 'team_Jeonbuk Hyundai Motors',
       'team_Kashiwa Reysol', 'team_Melbourne City FC', 'pcvc_encoded']

In [25]:
df = df[selected_features]
df.head()

Unnamed: 0,age,appearance,goals,yellow cards,minutes played,days_injured,games_injured,award,position_encoded,team_Cerezo Osaka,team_Gangwon FC,team_Jeonbuk Hyundai Motors,team_Kashiwa Reysol,team_Melbourne City FC,pcvc_encoded
26,18.0,72,0.52573,0.225313,3595,50,11,2,4,False,False,False,False,False,1
161,20.0,58,0.542714,0.180905,1990,0,0,0,4,False,False,False,False,False,1
184,18.0,58,0.666477,0.051267,3511,0,0,0,4,False,False,False,False,False,1
186,26.0,71,0.716887,0.225307,4394,96,14,3,4,False,False,False,False,False,1
242,25.0,73,0.62528,0.121022,4462,261,26,3,4,False,False,False,False,False,1


In [26]:
# Prepare data
X = df.drop(['pcvc_encoded'], axis=1)
y = df['pcvc_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)

# sacle the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
X.shape

(364, 14)

In [None]:
model = KNeighborsClassifier()

In [None]:
param_grid = {
    'n_neighbors': [7, 8, 9, 10, 11]
}
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro', 
                           verbose=1)

In [None]:
# Fit the model on the training data
grid_search.fit(X_train_scaled, y_train)

In [None]:
grid_search.best_params_

{'C': 1.1, 'gamma': 0.05, 'kernel': 'sigmoid'}

In [None]:
model = grid_search.best_estimator_

In [None]:
# Predict and evaluate the model
y_pred = model.predict(X_test_scaled)

In [None]:
# our benchmark model
base_model = round(df['pcvc_encoded'].value_counts()[1]/df.shape[0]*100, 2)
base_model

25.82

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5616438356164384

In [None]:
y_pred_train = model.predict(X_train_scaled)
accuracy = accuracy_score(y_train, y_pred_train)
accuracy

0.6082474226804123

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[13,  7,  9],
       [ 8, 10,  4],
       [ 4,  0, 18]], dtype=int64)

In [None]:
# Calculate Precision
precision = precision_score(y_test, y_pred, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.56


In [None]:
# Calculate Recall
recall = recall_score(y_test, y_pred, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.57


In [None]:
# Calculate F1 Score
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.56
