## Import Libraries

In [59]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import dtale as dt
from sklearn import metrics

%matplotlib inline

## Load the Data

In [55]:
# Load the player dataset from a CSV file located next to the notebook.
df = pd.read_csv("final_data.csv")

In [58]:
# Hand the frame to D-Tale (imported as `dt`) for interactive exploration.
# NOTE(review): this opens an interactive viewer, so the cell has no static output to reproduce.
dt.show(df)



In [36]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(10754, 22)

In [37]:
# Show the first two rows to check that the columns were parsed as expected.
df.head(2)

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,...,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,...,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0


## Preprocess the Data

#### 1. Simple EDA + Data Quality checking

In [38]:
# Uniqueness: count fully duplicated rows *before* removing them, otherwise
# the check below can never report anything.
n_duplicate_rows = int(df.duplicated().sum())
# Re-assign instead of mutating in place so re-running the cell is harmless.
df = df.drop_duplicates()
# Guard: no duplicated row may survive the de-duplication.
assert not df.duplicated().any()
# Displayed result: the remaining duplicated rows (an empty frame).
df[df.duplicated()]

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger


In [39]:
# Completeness: number of missing values in each column.
df.isnull().sum()

player                 0
team                   0
name                   0
position               0
height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
position_encoded       0
winger                 0
dtype: int64

In [40]:
# Accuracy: inspect each column's dtype to confirm it matches the kind of data it holds.
df.dtypes

player                  object
team                    object
name                    object
position                object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
position_encoded         int64
winger                   int64
dtype: object

In [41]:

# Column groups of the dataset. Every entry must match a name in df.columns
# exactly: several columns contain a space (e.g. 'yellow cards') and none has
# leading or trailing whitespace.

# String-valued identifier / label columns (dtype object).
categorical_features = ['player', 'team', 'name', 'position']

# Numeric columns (dtype int64 / float64). 'position_encoded' and 'winger'
# are integer encodings and are therefore listed with the numeric columns.
numeric_features = [
    'height',
    'age',
    'appearance',
    'goals',
    'assists',
    'yellow cards',
    'second yellow cards',
    'red cards',
    'goals conceded',
    'clean sheets',
    'minutes played',
    'days_injured',
    'games_injured',
    'award',
    'current_value',
    'highest_value',
    'position_encoded',
    'winger',
]

In [42]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


In [43]:
# Reload the raw CSV for the modelling section. Re-reading the file throws
# away the de-duplication done during the data-quality checks, so it is
# re-applied here to keep the modelling data consistent with that step.
df = pd.read_csv("final_data.csv").drop_duplicates()

In [44]:
# KNN classifier created with default hyper-parameters; it is the base
# estimator handed to the grid search further down.
KNeighbors_Classifier = KNeighborsClassifier() 

In [45]:
# correlation_matrix = df.corr()

# # Extract the correlations with the 'highest_value' column
# correlations_with_highest_value = correlation_matrix["highest_value"]

# # Find the column with the highest correlation to 'highest_value', excluding itself
# highest_corr_column = correlations_with_highest_value.drop("highest_value").idxmax()
# highest_corr_value = correlations_with_highest_value.drop("highest_value").max()

# print("Column with highest correlation to 'highest_value':", highest_corr_column)
# print("Highest correlation value:", highest_corr_value)

In [46]:

# Integer code assigned to each price category (used as the model target).
PRICE_CATEGORY_CODES = {
    'cheap price': 0,
    'good price': 1,
    'high price': 2
}

# Quantile cut-offs of `highest_value`. Only the 'high price' (75th
# percentile) and 'good price' (median) entries are consulted by
# categorize_value; the 'cheap price' entry is kept for reference but is not
# used as a boundary, because everything below the median is 'cheap price'.
thresholds = {
    'high price': df['highest_value'].quantile(0.75),
    'good price': df['highest_value'].quantile(0.50),
    'cheap price': df['highest_value'].quantile(0.25)
}


def categorize_value(value):
    """Return the price-category label for a single `highest_value`.

    Parameters
    ----------
    value : number
        A value to compare against the module-level `thresholds`.

    Returns
    -------
    str
        'high price' if value >= the 75th-percentile threshold,
        'good price' if value >= the median threshold,
        'cheap price' otherwise.
    """
    if value >= thresholds['high price']:
        return 'high price'
    if value >= thresholds['good price']:
        return 'good price'
    return 'cheap price'


# Label every row, then encode the label as an integer target.
df['price_category'] = df['highest_value'].apply(categorize_value)
df['price_category_encoded'] = df['price_category'].map(PRICE_CATEGORY_CODES)

# Every label must have an integer code; a NaN here would mean the label set
# and PRICE_CATEGORY_CODES have drifted apart.
assert df['price_category_encoded'].notna().all()

# Define the features and target variable.
# The train/test split is performed once, in the following cell, right before
# scaling; the identical split that used to live here was redundant.
features = ['appearance', 'minutes played', 'award', 'current_value']
X = df[features]
y = df['price_category_encoded']






In [47]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [48]:
# Candidate neighbourhood sizes to evaluate for the KNN classifier.
param_grid = {
    'n_neighbors': [1, 2, 5, 6, 7, 9]
}

# Exhaustive search over param_grid using 5-fold cross-validation on the
# scaled training data; candidates are ranked by macro-averaged F1 and all
# available CPU cores are used (n_jobs=-1).
grid_search = GridSearchCV(estimator=KNeighbors_Classifier,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro',
                           verbose=1,
                           n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Winning hyper-parameters and the estimator fitted with them.
best_params = grid_search.best_params_
best_knn_classifier = grid_search.best_estimator_

# Backward-compatible alias: later cells refer to the tuned model under this
# historical name even though it is a KNeighborsClassifier, not an SVM.
best_svm_classifier = best_knn_classifier



Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [49]:
# Predict the encoded price category for the scaled test rows.
# Despite its name, best_svm_classifier holds grid_search.best_estimator_,
# i.e. the tuned KNeighborsClassifier.
y_pred=best_svm_classifier.predict(X_test_scaled)

## KNN Model Evaluation

In [50]:


# Score the test-set predictions: overall accuracy, macro-averaged F1,
# the per-class report and the confusion matrix.
accuracy, f1_macro = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='macro'),
)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
# print(f"Best Parameters: {best_params}")
print(f"Accuracy: {accuracy}")
print(f"F1 Macro: {f1_macro}")
# Each heading is followed by its multi-line body on the next line.
print("Classification Report:", report, sep="\n")
print("Confusion Matrix:", conf_matrix, sep="\n")

Accuracy: 0.7773128777312878
F1 Macro: 0.7331170648209256
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.94      0.87      1070
           1       0.60      0.53      0.56       505
           2       0.86      0.70      0.77       576

    accuracy                           0.78      2151
   macro avg       0.76      0.72      0.73      2151
weighted avg       0.77      0.78      0.77      2151

Confusion Matrix:
[[1004   60    6]
 [ 178  266   61]
 [  53  121  402]]


In [51]:
import os

import joblib

# joblib.dump does not create missing directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs('Models', exist_ok=True)

# Persist the tuned KNN model together with the scaler fitted on the training
# data; both are needed to score new rows consistently.
joblib.dump(best_svm_classifier, 'Models/ML - knn.joblib')
joblib.dump(scaler, 'Models/scaler.joblib')

['Models/scaler.joblib']