## Import Libraries

In [None]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

## Load the Data

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv("final_data.csv")

In [None]:
#d = dtale.show(df)
#d.open_browser()

In [None]:
# Dimensions (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Preview the first rows to get a feel for the columns and values.
df.head()

## Preprocess the Data

#### 1. Simple EDA + Data Quality checking

In [None]:
# Uniqueness: remove exact duplicate rows, then list any duplicates that
# remain (an empty frame means none are left).
df = df.drop_duplicates()
df.loc[df.duplicated()]

In [None]:
# Accuracy: drop the columns that are not needed for the analysis.
df.drop(columns=['player', 'name', 'winger'], inplace=True)

In [None]:
# Drop every row that has at least one value outside the accepted ranges.
out_of_range = (
    (df['age'] <= 18) | (df['age'] >= 40)
    | (df['current_value'] == 0) | (df['highest_value'] == 0)
    | df['appearance'].isin([0, 95])
    | (df['current_value'] >= 7000000) | (df['current_value'] < 10000)
    | (df['highest_value'] >= 8500000) | (df['highest_value'] < 250000)
    | (df['height'] > 195) | (df['height'] <= 160)
)
df.drop(index=df.index[out_of_range], inplace=True)

# Remaining (rows, columns) after the filters.
df.shape

In [None]:
# Completeness: number of missing values in each column.
df.isna().sum()

In [None]:
# Accuracy: inspect the dtype of every column.
df.dtypes

In [None]:
# Column labels currently present in the frame.
df.columns

In [None]:
# Column groups used by later cells.
# `categorical_features` are the columns that get one-hot encoded further down.
categorical_features = [
    'team',
    'position',
]

# Numeric columns, listed in a fixed order.
numeric_features = [
    'age',
    'appearance',
    'goals',
    'assists',
    'yellow cards',
    'second yellow cards',
    'red cards',
    'goals conceded',
    'clean sheets',
    'minutes played',
    'days_injured',
    'games_injured',
    'award',
    'current_value',
    'highest_value',
    'position_encoded',
]

In [None]:
# Same range filters as the "Drop rows" cell earlier in this section. When the
# notebook is run top to bottom no row matches them any more, so nothing is
# removed here.
invalid = (
    df['age'].le(18) | df['age'].ge(40)
    | df['current_value'].eq(0) | df['highest_value'].eq(0)
    | df['appearance'].isin([0, 95])
    | df['current_value'].ge(7000000) | df['current_value'].lt(10000)
    | df['highest_value'].ge(8500000) | df['highest_value'].lt(250000)
    | df['height'].gt(195) | df['height'].le(160)
)
df.drop(index=invalid[invalid].index, inplace=True)

In [None]:
# Print the frequency table of every categorical column.
for feature in categorical_features:
    print(df[feature].value_counts())

In [None]:
# Frequency of each 'position' value.
type_counts = df['position'].value_counts()
type_counts
# NOTE(review): a filter on values appearing 10 or more times was announced
# here, but no filtering code follows — add it or drop the intent.

In [None]:
# Column labels after the cleaning steps so far.
df.columns

In [None]:
# (rows, columns) after the cleaning steps so far.
df.shape

In [None]:
#df[df['Type']=='Other']

In [None]:
# Number of rows per 'position' value.
position_counts = df['position'].value_counts()

# Bar chart of those counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
position_counts.plot.bar(ax=ax, color='coral')
ax.set_xlabel('position')
ax.set_ylabel('Count')
ax.set_title('Count of Each Position')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y')

# Render the figure.
fig.tight_layout()
plt.show()


In [None]:
# Number of rows per 'position_encoded' value.
position_counts = df['position_encoded'].value_counts()

# Bar chart of those counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
position_counts.plot.bar(ax=ax, color='coral')
ax.set(xlabel='position_encoded', ylabel='Count', title='Count of Each Position')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y')

# Render the figure.
fig.tight_layout()
plt.show()


In [None]:
# (rows, columns) before plotting the age distribution.
df.shape

In [None]:
# Distribution of 'age' as a 100-bin histogram.
fig, ax = plt.subplots(figsize=(15, 8))
ax.hist(df['age'], bins=100, color='skyblue', edgecolor='black')
ax.set(xlabel='age', ylabel='Frequency', title='Histogram of Age')
ax.grid(axis='y')

# Render the figure.
fig.tight_layout()
plt.show()


#### 2. Feature engineering

1. Feature scaling
2. Aggregation
3. One-hot encoding

In [None]:
# Compute the 'goals' cut points before displaying them. On a fresh kernel
# nothing has defined p35/p75 at this point, so the bare names raised NameError.
p35 = df['goals'].quantile(0.35)
p75 = df['goals'].quantile(0.75)
p35, p75

In [None]:
# Columns that the next cell one-hot encodes.
categorical_features

In [None]:
# One-hot encode the categorical columns: each listed column is replaced by
# indicator columns named `<column>_<value>`.
# NOTE(review): this cell is not re-runnable — after the first run the source
# columns are gone, so a second execution fails. Restart the kernel first.
df = pd.get_dummies(df, columns=categorical_features)

In [None]:
# (rows, columns) after one-hot encoding.
df.shape

In [None]:
# 35th and 75th percentiles of the 'goals' column, used as bucket cut points.
p35, p75 = (df['goals'].quantile(q) for q in (0.35, 0.75))

# Function to bucket a 'goals' value by the percentile cut points
def categorize_price(price, low=None, high=None):
    """Return the goals bucket that `price` falls into.

    Parameters
    ----------
    price : number
        Value to classify (a 'goals' figure, despite the inherited name).
    low, high : number, optional
        Lower and upper cut points. When omitted, the module-level `p35`
        and `p75` are used, which is the original behaviour.

    Returns
    -------
    str
        'less goals' when `price < low`, 'moderate goals' when
        `low <= price < high`, otherwise 'high goals'.
    """
    if low is None:
        low = p35
    if high is None:
        high = p75

    if price < low:
        return 'less goals'
    if price < high:
        return 'moderate goals'
    # The top bucket used to be labelled 'low goals', which contradicted its
    # meaning (values at or above the upper cut point).
    return 'high goals'


# Show how often each distinct 'goals' value occurs.
# NOTE(review): `categorize_price` is defined in this cell but never applied,
# so these are counts of the raw values, not of the goal categories.
print(df['goals'].value_counts())

In [None]:
# Replace each 'goals' value with the integer code LabelEncoder assigns to it.
# NOTE(review): 'goals' is a numeric column, so this maps every distinct value
# to its rank among the distinct values — confirm that this is intended rather
# than encoding the goal categories.
encoder = LabelEncoder()
encoder.fit(df['goals'])
df['goals'] = encoder.transform(df['goals'])

In [None]:
#df['car_price_category'] 

In [None]:
#encoder = LabelEncoder()
#df['car_price_category_encoded'] = encoder.fit_transform(df['car_price_category'])  

#### 3. Feature selection

In [None]:
# Correlation heatmap. numeric_only=True matches the correlation table computed
# in the next cell and keeps this call from raising if a non-numeric column is
# ever present (pandas >= 2.0 no longer drops such columns silently).
sns.heatmap(df.corr(numeric_only=True), annot=True);

In [None]:
# Pairwise correlations of the numeric columns, then each column's correlation
# with 'position_encoded', largest value first.
correlation = df.corr(numeric_only=True)
target_correlation = correlation['position_encoded'].sort_values(ascending=False)
print(target_correlation)

In [None]:
# Minimum absolute correlation a column needs to be kept (tune as required).
threshold = 0.2

# Keep the columns whose correlation with 'position_encoded' exceeds the
# threshold in absolute value, so strong negative correlations count too.
target_corr = correlation['position_encoded']
selected_features = target_corr[target_corr.abs() > threshold].index
selected_features

In [None]:
# Hand-picked model columns: the predictors plus 'position_encoded', which the
# train/test cell uses as the target.
# The one-hot 'position_*' columns are deliberately NOT listed: they are built
# from the 'position' column, so they restate the label being predicted
# (presumably position_encoded is a coarser coding of position — confirm).
# Keeping them as predictors leaks the answer into the model.
selected_features = ['height', 'goals', 'assists', 'goals conceded',
                     'position_encoded']

In [None]:
# Narrow the frame to the selected columns and preview the result.
df = df.loc[:, selected_features]
df.head()

#### 4. Prepare train and test data

In [None]:
# Separate the target column from the predictors.
y = df['position_encoded']
X = df.drop(columns='position_encoded')

# Hold out 20% of the rows for testing; shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# (rows, predictor columns) of the feature matrix.
X.shape

## Building the Model

In [None]:
# Fixed seeds so that tree construction, and therefore every score computed
# from these models, is reproducible across runs. 42 matches the seed already
# used for the train/test split.
dt_classifier = DecisionTreeClassifier(random_state=42)
rf_classifier = RandomForestClassifier(random_state=42)

In [None]:
# Hyper-parameter grid explored for the random forest.
param_grid = dict(
    max_depth=[4, 5, 6, 7, 8],       # maximum depth of each tree
    n_estimators=[35, 40, 50, 60],   # number of trees whose votes are combined
)

# 5-fold cross-validated search scored by macro-averaged F1.
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    scoring='f1_macro',
    cv=5,
    verbose=1,
)

## Train the Model

In [None]:
# Fit the decision tree on the training split.
dt_classifier.fit(X_train, y_train)

In [None]:
# Fit the untuned random forest on the training split.
rf_classifier.fit(X_train, y_train)

In [None]:
# Run the cross-validated grid search over the random-forest parameter grid.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_search.best_params_

In [None]:
# Estimator selected by the grid search; used below as the tuned random forest.
best_model = grid_search.best_estimator_

## Test the Model

In [None]:
# Test-set predictions from the decision tree, the untuned random forest and
# the tuned random forest, in that order.
y_pred_dt, y_pred_rf, y_pred_rf_g = (
    fitted.predict(X_test)
    for fitted in (dt_classifier, rf_classifier, best_model)
)

## Interpretation of the Model

In [None]:
# Per-feature importance of the tuned random forest, most important first.
# Tree ensembles have no `coef_` matrix; `feature_importances_` is their
# per-feature summary. The previous code also read from `model`, a name that
# is not defined anywhere in the visible notebook.
coeff_df = pd.DataFrame(best_model.feature_importances_, X.columns, columns=['Importance'])
coeff_df = coeff_df.sort_values('Importance', ascending=False)
coeff_df

In [None]:
# Raw importance array of the tuned random forest, in the column order of X.
# Replaces `model.coef_`: `model` is undefined here and tree models have no
# coefficients.
best_model.feature_importances_

In [None]:
# Tree models have no intercept (and `model` is undefined here), so report the
# fitted decision tree's size instead: its depth and its number of leaves.
print(dt_classifier.get_depth(), dt_classifier.get_n_leaves())

## Evaluating the Model 

In [None]:
# Benchmark: accuracy (in percent) of always predicting the most frequent class.
# The previous `value_counts()[1]` picked the count of the class labelled 1
# (a label lookup on the value index), which is the majority class only by
# coincidence.
base_model = round(df['position_encoded'].value_counts(normalize=True).max() * 100, 2)
base_model

1. Accuracy

In [None]:
# Test-set accuracy of the tuned random forest.
# Uses `y_pred_rf_g` from the prediction cell; the bare `y_pred` used before is
# never assigned in this notebook and raised NameError on a fresh run.
accuracy = accuracy_score(y_test, y_pred_rf_g)
accuracy

In [None]:
# Training-set accuracy of the tuned random forest, to compare against the
# test accuracy and spot over-fitting.
# `model` and `X_train_scaled` do not exist in this notebook (no scaling step
# is performed), so predict with `best_model` on `X_train`. The result gets its
# own name so it does not overwrite the test accuracy.
y_pred_train = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
train_accuracy

In [None]:
# Confusion matrix of the tuned random forest on the test split (rows are the
# true classes, columns the predicted ones). `y_pred` was never defined; the
# tuned model's predictions live in `y_pred_rf_g`.
conf_matrix = confusion_matrix(y_test, y_pred_rf_g)
conf_matrix

In [None]:
# Plot the tuned random forest's confusion matrix with its class labels.
# `y_pred` and `model` were undefined names; the tuned model and its
# predictions are `best_model` and `y_pred_rf_g`. `ConfusionMatrixDisplay` is
# imported with the other scikit-learn names at the top of the notebook.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf_g,
                                        display_labels=best_model.classes_,
                                        cmap="Greens",
                                        xticks_rotation='vertical')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Macro-averaged precision of the tuned random forest on the test split
# (`y_pred` was undefined; `y_pred_rf_g` holds the tuned model's predictions).
precision = precision_score(y_test, y_pred_rf_g, average='macro')
print(f"Precision: {precision:.2f}")

In [None]:
# Macro-averaged recall of the tuned random forest on the test split
# (`y_pred` was undefined; `y_pred_rf_g` holds the tuned model's predictions).
recall = recall_score(y_test, y_pred_rf_g, average='macro')
print(f"Recall: {recall:.2f}")

In [None]:
# Macro-averaged F1 of the tuned random forest on the test split
# (`y_pred` was undefined; `y_pred_rf_g` holds the tuned model's predictions).
f1 = f1_score(y_test, y_pred_rf_g, average='macro')
print(f"F1 Score: {f1:.2f}")