## Import Libraries

In [321]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score , classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics

%matplotlib inline

## Load the Data

In [322]:
df = pd.read_csv("./DF_Classification.csv")

In [323]:
df.shape

(3386, 19)

In [324]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value
0,8,Manchester United,Defender Centre-Back,186.0,21.0,34,0.0,0.0,0.130529,0.0,0.0,0.0,0.0,2758,47,13,0,2000000,2000000
1,10,Manchester United,Defender Left-Back,169.0,23.0,89,0.012619,0.063096,0.227145,0.0,0.0,0.0,0.0,7132,182,15,3,22000000,22000000


In [325]:
df.drop(columns='Unnamed: 0' , inplace = True)
# df.drop(columns='name' , inplace = True)

## Preprocess the Data

#### 1. Simple EDA + Data Quality checking

In [326]:
# uniqeness
df.drop_duplicates(inplace=True)
df[df.duplicated()]

Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value


In [327]:
# Completeness
df.isnull().sum()

team                   0
position               0
height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
dtype: int64

In [328]:
# Accurecy types
df.dtypes

team                    object
position                object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
dtype: object

In [329]:
numeric_col = [col for col in df.columns if df[col].dtype != "object"]
categorical_col = [col for col in df.columns if df[col].dtype == "object"]

In [330]:
for i in categorical_col:
    print(df[i].value_counts())

team
Southampton FC       21
AFC Bournemouth      20
AJ Auxerre           19
Orlando Pirates      19
Huddersfield Town    19
                     ..
Daegu FC              1
FC Seoul              1
Austin FC             1
Real Madrid           1
Kashima Antlers       1
Name: count, Length: 323, dtype: int64
position
Defender Centre-Back          783
midfield-CentralMidfield      466
Defender Right-Back           370
midfield-DefensiveMidfield    361
Defender Left-Back            345
Attack Centre-Forward         313
midfield-AttackingMidfield    235
Attack-RightWinger            227
Attack-LeftWinger             199
midfield-RightMidfield         35
midfield-LeftMidfield          30
Attack-SecondStriker           16
Goalkeeper                      3
Attack                          1
Defender                        1
midfield                        1
Name: count, dtype: int64


#### 2. Feature engineering

1. Feature scaling
2. Aggregation
3. One hot coding

In [331]:
df.head(2)

Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value
0,Manchester United,Defender Centre-Back,186.0,21.0,34,0.0,0.0,0.130529,0.0,0.0,0.0,0.0,2758,47,13,0,2000000,2000000
1,Manchester United,Defender Left-Back,169.0,23.0,89,0.012619,0.063096,0.227145,0.0,0.0,0.0,0.0,7132,182,15,3,22000000,22000000


In [332]:
categorical_col = ['team']

In [333]:
# one hot coding
df = pd.get_dummies(df, columns=categorical_col)

In [334]:
df.shape

(3386, 340)

In [335]:
encoder = LabelEncoder()
df['position'] = encoder.fit_transform(df['position'])  

#### 3. Feature selection

In [336]:
# sns.heatmap(df.corr(), annot=True);

In [337]:
correlation = df.corr()
correlation['position'].sort_values(ascending=False)

position                1.000000
yellow cards            0.191554
minutes played          0.068293
team_Spartak Moscow     0.045466
team_Bologna FC 1909    0.040503
                          ...   
team_Cádiz CF          -0.038223
age                    -0.047603
assists                -0.103474
height                 -0.134309
goals                  -0.414769
Name: position, Length: 340, dtype: float64

In [338]:
# Set the correlation threshold
threshold = 0.2  # You can change this value based on your requirement

# Filter the correlations
# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['position']) > threshold]['position'].index
selected_features

Index(['position', 'goals'], dtype='object')

#### 4. Prepare train and test data

In [339]:
# Prepare data
X = df.drop(['position'], axis=1)
y = df['position']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)

# sacle the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [340]:
X.shape

(3386, 339)

## Buliding the Model

In [341]:
model_RF = RandomForestClassifier()
model_DT = DecisionTreeClassifier()

In [342]:
param_grid = {
    'max_depth': [4, 5, 6, 7, 10, 15],   # Regularization strength
    'n_estimators':[35, 40, 50, 60]
}
grid_search = GridSearchCV(estimator=model_RF,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro', 
                           verbose=1)

## Train the Model

In [343]:
model_RF.fit(X_train_scaled, y_train)

In [344]:
model_DT.fit(X_train_scaled, y_train)

In [345]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [346]:
grid_search.best_params_

{'max_depth': 15, 'n_estimators': 40}

In [347]:
best_model = grid_search.best_estimator_


## Test the Model

In [348]:
# Predict and evaluate the model
y_pred_DT = model_DT.predict(X_test)
y_pred_RF = model_RF.predict(X_test)
y_pred_RF_G = best_model.predict(X_test)

## Evaluating the Model 

In [349]:
# our benchmark model
base_model = round(df['position'].value_counts()[1]/df.shape[0]*100, 2)
base_model

9.24

1. Accuracy

In [350]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_DT)
accuracy*100

20.058997050147493

In [351]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_RF)
accuracy*100

15.486725663716813

In [352]:
accuracy = accuracy_score(y_test, y_pred_RF_G)
accuracy*100

40.707964601769916

In [353]:
y_pred_train_RF = model_RF.predict(X_train_scaled)
accuracy = accuracy_score(y_train, y_pred_train_RF)
accuracy*100

100.0

In [354]:
y_pred_test_RF = model_RF.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_test_RF)
accuracy*100

45.57522123893805

In [355]:
y_pred_train_DT = model_DT.predict(X_train_scaled)
accuracy = accuracy_score(y_train, y_pred_train_DT)
accuracy*100

100.0

In [356]:
y_pred_test_DT = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred_test_DT)
accuracy*100

22.86135693215339

2. Precision

In [357]:
# Calculate Precision
precision = precision_score(y_test, y_pred_RF, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.14


In [358]:
# Calculate Precision
precision = precision_score(y_test, y_pred_DT, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.03


In [359]:
# Calculate Precision
precision = precision_score(y_test, y_pred_RF_G, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.20


In [360]:
y_pred_train_RF = model_RF.predict(X_train_scaled)
precision = precision_score(y_train, y_pred_train_RF, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 1.00


In [361]:
y_pred_test_RF = model_RF.predict(X_test_scaled)
precision = precision_score(y_test, y_pred_test_RF, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.26


In [362]:
y_pred_train_DT = model_DT.predict(X_train_scaled)
precision = precision_score(y_train, y_pred_train_DT, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 1.00


In [363]:
y_pred_test_DT = model_DT.predict(X_test_scaled)
precision = precision_score(y_test, y_pred_test_DT, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.21


In [364]:
y_pred_train_RF_G = best_model.predict(X_train_scaled)
precision = precision_score(y_train, y_pred_train_RF_G, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.25


In [365]:
y_pred_test_RF_G = best_model.predict(X_test_scaled)
precision = precision_score(y_test, y_pred_test_RF_G, average='macro')
print(f"Precision: {precision:.2f}")

Precision: 0.21


3. Recall

In [366]:
# Calculate Recall
recall = recall_score(y_test, y_pred_DT, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.08


In [367]:
# Calculate Recall
recall = recall_score(y_test, y_pred_RF, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.09


In [368]:
# Calculate Recall
recall = recall_score(y_test, y_pred_RF_G, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.22


In [369]:
y_pred_train_RF = model_RF.predict(X_train_scaled)
recall = recall_score(y_train, y_pred_train_RF, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 1.00


In [370]:
y_pred_test_RF = model_RF.predict(X_test_scaled)
recall = recall_score(y_test, y_pred_test_RF, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.27


In [371]:
y_pred_train_DT = model_DT.predict(X_train_scaled)
recall = recall_score(y_train, y_pred_train_DT, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 1.00


In [372]:
y_pred_test_DT = model_DT.predict(X_test_scaled)
recall = recall_score(y_test, y_pred_test_DT, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.21


In [373]:
y_pred_train_DT = best_model.predict(X_train_scaled)
recall = recall_score(y_train, y_pred_train_DT, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.21


In [374]:
y_pred_test_RF_G = best_model.predict(X_test_scaled)
recall = recall_score(y_test, y_pred_test_RF_G, average='macro')
print(f"Recall: {recall:.2f}")

Recall: 0.15


4. F1 Score

In [375]:
# Calculate F1 Score
f1 = f1_score(y_test, y_pred_DT, average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.05


In [376]:
# Calculate F1 Score
f1 = f1_score(y_test, y_pred_RF, average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.05


In [377]:
# Calculate F1 Score
f1 = f1_score(y_test, y_pred_RF_G, average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.19


In [378]:
y_pred_train_RF = model_RF.predict(X_train_scaled)
f1 = f1_score(y_train,y_pred_train_RF,average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 1.00


In [379]:
y_pred_test_RF = model_RF.predict(X_test_scaled)
f1 = f1_score(y_test,y_pred_test_RF,average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.25


In [380]:
y_pred_train_DT = model_DT.predict(X_train_scaled)
f1 = f1_score(y_train,y_pred_train_DT,average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 1.00


In [381]:
y_pred_test_DT = model_DT.predict(X_test_scaled)
f1 = f1_score(y_test,y_pred_test_DT,average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.21


In [382]:
y_pred_train_RF_G = best_model.predict(X_train_scaled)
f1 = f1_score(y_train,y_pred_train_RF_G,average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.19


In [383]:
y_pred_test_RF_G = best_model.predict(X_test_scaled)
f1 = f1_score(y_test,y_pred_test_RF_G,average='macro')
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.13


In [384]:
print(classification_report(y_test,
                      y_pred_test_RF,
                      labels=list(model_RF.classes_)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.59      0.86      0.70        64
           2       0.24      0.16      0.19        37
           3       0.32      0.22      0.26        50
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.64      0.95      0.76       168
           7       0.28      0.23      0.26        73
           8       0.27      0.25      0.26        72
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.15      0.07      0.10        43
          12       0.29      0.33      0.31        92
          13       0.34      0.15      0.21        66
          14       0.00      0.00      0.00         5
          15       0.00      0.00      0.00         6

    accuracy                           0.46       678
   macro avg       0.20   

In [385]:
print(classification_report(y_test,
                      y_pred_test_DT,
                      labels=list(model_RF.classes_)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.49      0.41      0.44        64
           2       0.11      0.11      0.11        37
           3       0.14      0.12      0.13        50
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.68      0.70      0.69       168
           7       0.23      0.18      0.20        73
           8       0.18      0.18      0.18        72
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.12      0.14      0.13        43
          12       0.22      0.22      0.22        92
          13       0.23      0.30      0.26        66
          14       0.12      0.20      0.15         5
          15       0.00      0.00      0.00         6

    accuracy                           0.33       678
   macro avg       0.16   

In [386]:
print(classification_report(y_test,
                      y_pred_test_RF_G,
                      labels=list(best_model.classes_)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.28      0.16      0.20        64
           2       0.13      0.32      0.19        37
           3       0.00      0.00      0.00        50
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.55      0.42      0.48       168
           7       0.17      0.41      0.24        73
           8       0.13      0.21      0.16        72
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.11      0.12      0.11        43
          12       0.15      0.12      0.13        92
          13       1.00      0.02      0.03        66
          14       0.00      0.00      0.00         5
          15       0.00      0.00      0.00         6

    accuracy                           0.23       678
   macro avg       0.16   