## Import Libraries

In [220]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score , classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics

%matplotlib inline

## Load the Data

In [221]:
# Load the raw player dataset from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv("./Lower player.csv")

In [222]:
df.shape

(5912, 19)

In [223]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value
0,1,Manchester United,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000
1,2,Manchester United,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000


In [224]:
# Remove the 'Unnamed: 0' column (presumably a row counter saved with the CSV - it is not a player attribute).
# Reassigning instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

## Preprocess the Data

#### 1. Simple EDA + Data Quality checking

In [225]:
# Uniqueness: remove fully duplicated rows, then display any duplicates
# that are still present (an empty frame is the expected result)
df = df.drop_duplicates()
df.loc[df.duplicated()]

Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value


In [226]:
# Completeness: number of missing values in every column
df.isna().sum()

team                   0
position               0
height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
dtype: int64

In [227]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, every other column as numeric.
numeric_col = [name for name, dtype in df.dtypes.items() if dtype != "object"]
categorical_col = [name for name, dtype in df.dtypes.items() if dtype == "object"]

In [228]:
# for i in categorical_col:
#     print(df[i].value_counts())

In [229]:
# Band boundaries: the 35th and 75th percentiles of the current market value
p35, p75 = df['current_value'].quantile([0.35, 0.75])


def categorize_price(price):
    """Return the price-band label for a single value.

    Values below the 35th percentile map to 'Cheap_Price', values from the
    35th percentile up to (but excluding) the 75th map to 'Good_Price', and
    everything else maps to 'High_Price'.
    """
    if price < p35:
        return 'Cheap_Price'
    if price < p75:
        return 'Good_Price'
    return 'High_Price'


# Derive the classification target from the raw value, then remove the raw
# value so it cannot be used as a feature
df['sale_price_category'] = df['current_value'].map(categorize_price)
df = df.drop(columns='current_value')

# Show how many players fall into each band
print(df['sale_price_category'].value_counts())

sale_price_category
Good_Price     2255
Cheap_Price    1946
High_Price     1711
Name: count, dtype: int64


#### 2. Feature engineering

1. Feature scaling
2. Aggregation
3. One-hot encoding

In [230]:
df.head(2)

Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,highest_value,sale_price_category
0,Manchester United,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,22000000,High_Price
1,Manchester United,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,6000000,Good_Price


In [231]:
# Columns to one-hot encode; this replaces the dtype-derived list built earlier
categorical_col = ['team' , 'position']

In [232]:
# One-hot encoding: every category of the listed columns becomes its own indicator column
df = pd.get_dummies(data=df, columns=categorical_col)

In [233]:
df.shape

(5912, 403)

In [234]:
# Convert the string band labels into integer class codes.
# LabelEncoder numbers the classes in sorted label order, so here
# Cheap_Price -> 0, Good_Price -> 1, High_Price -> 2.
encoder = LabelEncoder().fit(df['sale_price_category'])
df['sale_price_category'] = encoder.transform(df['sale_price_category'])

#### 3. Feature selection

In [235]:
# Pairwise correlation of all columns; display each column's correlation
# with the encoded target, strongest positive first
correlation = df.corr()
target_corr = correlation['sale_price_category']
target_corr.sort_values(ascending=False)

sale_price_category    1.000000
highest_value          0.381909
minutes played         0.306681
appearance             0.305590
games_injured          0.233946
                         ...   
position_Defender     -0.090184
position_Attack       -0.115159
position_midfield     -0.133761
goals conceded        -0.143927
position_Goalkeeper   -0.209644
Name: sale_price_category, Length: 403, dtype: float64

In [236]:
# Minimum absolute correlation with the target for a column to be kept
# (tune this value as needed)
threshold = 0.2

# Compare on the absolute value so that strong negative correlations are
# kept as well. The target itself (self-correlation 1.0) passes the filter
# and therefore stays in the selection.
target_corr = correlation['sale_price_category']
selected_features = target_corr[target_corr.abs() > threshold].index
selected_features

Index(['appearance', 'minutes played', 'days_injured', 'games_injured',
       'award', 'highest_value', 'sale_price_category', 'position_Goalkeeper'],
      dtype='object')

In [237]:
# Keep only the selected columns (the target column is one of them)
df = df.loc[:, selected_features]

#### 4. Prepare train and test data

In [238]:
# Separate the target label from the predictor columns
y = df['sale_price_category']
X = df.drop(columns=['sale_price_category'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Feature scaling is left disabled (kept below for reference); the models in
# this notebook are fitted on the raw feature values.
# # scale the data
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [239]:
X.shape

(5912, 7)

## Building the Model

In [240]:
# Both estimators are randomized (bootstrap sampling / feature sub-sampling /
# tie-breaking between splits). Fixing random_state makes the fitted models
# and every score below repeatable on Restart & Run All; 42 matches the seed
# used for the train/test split.
model_RF = RandomForestClassifier(random_state=42)
model_DT = DecisionTreeClassifier(random_state=42)

In [241]:
# Hyperparameter grid for the random forest: 6 depths x 4 forest sizes = 24 candidates
param_grid = {
    'max_depth': [4, 5, 6, 7, 10, 15],   # maximum depth allowed for each tree
    'n_estimators':[35, 40, 50, 60]   # number of trees in the forest
}
# 5-fold cross-validated search over the grid, ranking candidates by macro-averaged F1
grid_search = GridSearchCV(estimator=model_RF,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro', 
                           verbose=1)

## Train the Model

In [242]:
# Fit the un-tuned random forest on the training split
model_RF.fit(X_train, y_train)

In [243]:
# Fit the decision tree on the training split
model_DT.fit(X_train, y_train)

In [244]:
# Run the cross-validated grid search over param_grid using only the training split
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [245]:
grid_search.best_params_

{'max_depth': 7, 'n_estimators': 50}

In [246]:
# Random forest selected by the grid search (the best-scoring hyperparameter combination)
best_model = grid_search.best_estimator_


## Test the Model

In [247]:
# Test-split predictions from each fitted model: the grid-searched random
# forest, the un-tuned random forest, and the decision tree
y_pred_RF_G = best_model.predict(X_test)
y_pred_RF = model_RF.predict(X_test)
y_pred_DT = model_DT.predict(X_test)

## Evaluating the Model 

In [248]:
# Benchmark: accuracy (in %) of always predicting the most frequent class.
# value_counts(normalize=True).max() picks the majority class by frequency
# instead of relying on a hard-coded class label, so the benchmark stays
# correct if the class balance or the label encoding changes.
base_model = round(df['sale_price_category'].value_counts(normalize=True).max() * 100, 2)
base_model

38.14

1. Accuracy

In [249]:
# Decision tree: accuracy on the held-out test split versus the training split
print("DecisionTreeClassifier")

y_pred_train_DT = model_DT.predict(X_train)
accuracy_test_DT = accuracy_score(y_true=y_test, y_pred=y_pred_DT)
accuracy_train_DT = accuracy_score(y_true=y_train, y_pred=y_pred_train_DT)

print('Test Accuracy for DT', accuracy_test_DT*100)
print('Train Accuracy for DT', accuracy_train_DT*100)

DecisionTreeClassifier
Test Accuracy for DT 74.63359639233371
Train Accuracy for DT 99.9275012083132


In [250]:
# Un-tuned random forest: accuracy on the test split versus the training split.
# The printed labels now name the random forest (they previously said "DT").
print('RandomForestClassifier')

accuracy_test_RF = accuracy_score(y_test, y_pred_RF)
print('Test Accuracy for RF', accuracy_test_RF*100)

y_pred_train_RF = model_RF.predict(X_train)
accuracy_train_RF = accuracy_score(y_train, y_pred_train_RF)
print('Train Accuracy for RF', accuracy_train_RF*100)

RandomForestClassifier
Test Accuracy for DT 79.65050732807215
Train Accuracy for DT 99.9275012083132


In [251]:
# Grid-searched random forest: accuracy on the test split versus the training split.
# The printed labels now name the tuned random forest (they previously said "DT").
print('GridSearchCV for RandomForestClassifier')

accuracy_test_RF_G = accuracy_score(y_test, y_pred_RF_G)
print('Test Accuracy for tuned RF', accuracy_test_RF_G*100)

y_pred_train_RF_G = best_model.predict(X_train)
accuracy_train_RF_G = accuracy_score(y_train, y_pred_train_RF_G)
print('Train Accuracy for tuned RF', accuracy_train_RF_G*100)

GridSearchCV for RandomForestClassifier
Test Accuracy for DT 81.0033821871477
Train Accuracy for DT 84.50942484291929


2. Precision

In [252]:
# Decision tree: macro-averaged precision on the test and training splits
print("DecisionTreeClassifier")

precision_test_DT = precision_score(y_test, y_pred_DT, average='macro')
print(f"Test Precision: {precision_test_DT:.2f}")

# The training predictions must come from the decision tree itself; scoring
# the random forest's predictions here would report the wrong model.
y_pred_train_DT = model_DT.predict(X_train)
precision_train_DT = precision_score(y_train, y_pred_train_DT, average='macro')
print(f"Train Precision: {precision_train_DT:.2f}")

DecisionTreeClassifier
Test Precision: 0.75
Train Precision: 1.00


In [253]:
# Un-tuned random forest: macro-averaged precision, test split versus training split
print("RandomForestClassifier")

y_pred_train_RF = model_RF.predict(X_train)
precision_test_RF = precision_score(y_true=y_test, y_pred=y_pred_RF, average='macro')
precision_train_RF = precision_score(y_true=y_train, y_pred=y_pred_train_RF, average='macro')

print(f"Test Precision: {precision_test_RF:.2f}")
print(f"Train Precision: {precision_train_RF:.2f}")

RandomForestClassifier
Test Precision: 0.81
Train Precision: 1.00


In [254]:
# Grid-searched random forest: macro-averaged precision, test split versus training split
print('GridSearchCV for RandomForestClassifier')

y_pred_train_RF_G = best_model.predict(X_train)
precision_test_RF_G = precision_score(y_true=y_test, y_pred=y_pred_RF_G, average='macro')
precision_train_RF_G = precision_score(y_true=y_train, y_pred=y_pred_train_RF_G, average='macro')

print(f"Test Precision: {precision_test_RF_G:.2f}")
print(f"Train Precision: {precision_train_RF_G:.2f}")

GridSearchCV for RandomForestClassifier
Test Precision: 0.83
Train Precision: 0.86


3. Recall

In [255]:
# Decision tree: macro-averaged recall on the test and training splits.
# The two lines are labelled "Test" / "Train" so they can be told apart,
# matching the other recall cells.
print("DecisionTreeClassifier")

recall_test_DT = recall_score(y_test, y_pred_DT, average='macro')
print(f"Test Recall: {recall_test_DT:.2f}")

y_pred_train_DT = model_DT.predict(X_train)
recall_train_DT = recall_score(y_train, y_pred_train_DT, average='macro')
print(f"Train Recall: {recall_train_DT:.2f}")

DecisionTreeClassifier
Recall: 0.75
Recall: 1.00


In [256]:
# Un-tuned random forest: macro-averaged recall, test split versus training split
print("RandomForestClassifier")

y_pred_train_RF = model_RF.predict(X_train)
recall_test_RF = recall_score(y_true=y_test, y_pred=y_pred_RF, average='macro')
recall_train_RF = recall_score(y_true=y_train, y_pred=y_pred_train_RF, average='macro')

print(f"Test Recall: {recall_test_RF:.2f}")
print(f"Train Recall: {recall_train_RF:.2f}")

RandomForestClassifier
Test Recall: 0.80
Train Recall: 1.00


In [257]:
# Grid-searched random forest: macro-averaged recall, test split versus training split
print('GridSearchCV for RandomForestClassifier')

y_pred_train_RF_G = best_model.predict(X_train)
recall_test_RF_G = recall_score(y_true=y_test, y_pred=y_pred_RF_G, average='macro')
recall_train_RF_G = recall_score(y_true=y_train, y_pred=y_pred_train_RF_G, average='macro')

print(f"Test Recall: {recall_test_RF_G:.2f}")
print(f"Train Recall: {recall_train_RF_G:.2f}")

GridSearchCV for RandomForestClassifier
Test Recall: 0.82
Train Recall: 0.85


4. F1 Score

In [258]:
# Decision tree: macro-averaged F1, test split versus training split
print("DecisionTreeClassifier")

y_pred_train_DT = model_DT.predict(X_train)
f1_test_DT = f1_score(y_true=y_test, y_pred=y_pred_DT, average='macro')
f1_train_DT = f1_score(y_true=y_train, y_pred=y_pred_train_DT, average='macro')

print(f"Test F1 Score: {f1_test_DT:.2f}")
print(f"Train F1 Score: {f1_train_DT:.2f}")

DecisionTreeClassifier
Test F1 Score: 0.75
Train F1 Score: 1.00


In [259]:
# Un-tuned random forest: macro-averaged F1, test split versus training split
print("RandomForestClassifier")

y_pred_train_RF = model_RF.predict(X_train)
f1_test_RF = f1_score(y_true=y_test, y_pred=y_pred_RF, average='macro')
f1_train_RF = f1_score(y_true=y_train, y_pred=y_pred_train_RF, average='macro')

print(f"Test F1 Score: {f1_test_RF:.2f}")
print(f"Train F1 Score: {f1_train_RF:.2f}")

RandomForestClassifier
Test F1 Score: 0.80
Train F1 Score: 1.00


In [260]:
# Grid-searched random forest: macro-averaged F1, test split versus training split
print('GridSearchCV for RandomForestClassifier')

y_pred_train_RF_G = best_model.predict(X_train)
f1_test_RF_G = f1_score(y_true=y_test, y_pred=y_pred_RF_G, average='macro')
f1_train_RF_G = f1_score(y_true=y_train, y_pred=y_pred_train_RF_G, average='macro')

print(f"Test F1 Score: {f1_test_RF_G:.2f}")
print(f"Train F1 Score: {f1_train_RF_G:.2f}")

GridSearchCV for RandomForestClassifier
Test F1 Score: 0.81
Train F1 Score: 0.85
