In [None]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics

%matplotlib inline

## Load the data
When running in Colab, Google Drive must be mounted first so that the notebook can access files stored on it (the mount lines in the cell below are commented out). The pandas `read_csv()` function then loads the CSV file into a pandas DataFrame.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# Load the players CSV into a DataFrame.
# NOTE(review): the path points inside /content/drive, so the Drive mount above
# presumably has to be enabled when running in Colab — confirm.
csv_path = "/content/drive/MyDrive/Colab Notebooks/final_data.csv"
df = pd.read_csv(csv_path)

## View the dataframe
The `shape`, `head`, `tail`, and `columns` attributes and methods of a pandas DataFrame let us quickly gauge the size and structure of our dataset. These techniques help us gain a deeper understanding of the data.


df.head(2)

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# Preview the first two rows
df.head(2)

In [None]:
# List the column labels
df.columns

## Profiling Data
The practice of thoroughly analyzing the data in an existing dataset and gathering statistics and information about it is known as data profiling.

# Preprocess The Data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Number of rows per team, most frequent first
df["team"].value_counts(ascending=False)

In [None]:
# Number of rows per distinct 'award' value, most frequent first
df["award"].value_counts(ascending=False)

In [None]:
# Distinct values present in 'winger'
df["winger"].unique()

In [None]:
# Distinct values present in 'position_encoded'
df["position_encoded"].unique()

In [None]:
# Distinct values present in 'height' (useful for spotting implausible entries)
df["height"].unique()

In [None]:
# Dimensions again; the profiling cells above only read from df, so nothing should have changed
df.shape

## Data Quality Checks
Data quality checks involve the process of ensuring that the data is accurate, complete, consistent, relevant, and reliable.

Here are typical steps involved in checking data quality

 **1.Reliability** :

Evaluate the data's source and collection process to determine its trustworthiness

In [None]:
# Licensed in kaggle

**2.Timeliness**:
 Ensure the data is up-to-date and reflective of the current situation or the period of interest for the analysis

In [None]:
# The data covers two seasons: 2021-2022 and 2022-2023

**3.Consistency**:

*Confirm* that the data is consistent within the dataset and across multiple data sources. For example, the same data point should not have different values in different places.

In [None]:
#Consistent

**4.Relevance:**

Assess whether the data is appropriate and applicable for the intended analysis. Data that is not relevant can skew results and lead to incorrect conclusions.

**The following are important factors for relevance:**

1. Sample Appropriateness: Verify that the goals of your analysis are met by the data sample you have chosen. For example, using statistics from the Northern region will not provide reliable information for the Kingdom's Western region.


2. Variable Selection: We can use the drop() technique to remove any columns that are irrelevant to our study. Since we are working with columns, we will set the "axis" parameter to 1. To make the change permanent, we will set the "inplace" argument to True.





In [None]:
# the data is relevant

**5. Uniqueness:**

Check for and remove duplicate records to prevent skewed analysis results.

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()
# Author's note: no duplicates found (duplicated() compares whole rows, not columns)

In [None]:
# Missing values per column
df.isnull().sum()

In [None]:
# Check the dtype of every column
df.dtypes

In [None]:
# Column labels, for reference before cleaning
df.columns

In [None]:
# Dimensions before duplicates and outliers are removed
df.shape

In [None]:
# Uniqueness: drop fully duplicated rows in place (the first occurrence is kept),
# then display any rows still flagged as duplicates — expected to be an empty frame.
df.drop_duplicates(inplace=True)
df[df.duplicated()]

In [None]:
# Completeness: missing values per column
df.isnull().sum()

In [None]:
# Accuracy: verify that every column has the expected dtype
df.dtypes

In [None]:
# Columns excluded from the analysis; removed from df in place.
unused_columns = [
    "team", "name", "player",
    'yellow cards', 'second yellow cards', 'red cards',
    'goals conceded', 'clean sheets',
]
df.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Drop rows that do not look like a normal football-player description.
# Every condition below marks a row for removal; a row is dropped as soon
# as it matches any one of them.
out_of_range = (
    # age outside the open interval (18, 38)
    (df['age'] <= 18) | (df['age'] >= 38)
    # no market value recorded
    | (df['current_value'] == 0) | (df['highest_value'] == 0)
    # no appearances
    | (df['appearance'] == 0.00)
    # height outside (140, 200]
    | (df['height'] <= 140) | (df['height'] > 200)
    # current / highest value outside [10_000, 50_000_000)
    | (df['current_value'] >= 50000000) | (df['current_value'] < 10000)
    | (df['highest_value'] >= 50000000) | (df['highest_value'] < 10000)
    # no goals (this also removes every player whose 'goals' value is exactly 0)
    | (df['goals'] == 0.00)
)
df.drop(df[out_of_range].index, inplace=True)

In [None]:
# Dimensions after the column drop and row filters
df.shape

**B. Feature engineering**


1.Feature scaling            
2.Aggregation                        
3.One-hot encoding


In [None]:
# Preview the cleaned frame before feature engineering
df.head(2)

In [None]:
# Column groups used for feature engineering.
# `categorical_features` is later passed to pd.get_dummies for one-hot encoding.
categorical_features = ['position']
# Only columns that still exist in df are listed: the card columns,
# 'goals conceded' and 'clean sheets' were dropped from df earlier, so keeping
# them here would make `df[numeric_features]` raise a KeyError.
numeric_features = ['height', 'age', 'appearance', 'goals', 'assists',
       'minutes played', 'days_injured', 'games_injured',
       'award', 'current_value', 'highest_value', 'winger']

In [None]:
# Show how many rows fall into each level of every categorical column
for column_name in categorical_features:
    print(df[column_name].value_counts())

In [None]:
# One-hot encoding: replace each column named in `categorical_features` with
# one indicator column per level. `df` is rebound to the new frame returned
# by get_dummies.
df = pd.get_dummies(df, columns=categorical_features)

In [None]:
# Dimensions after one-hot encoding (column count changes)
df.shape

In [None]:
# Preview the frame with the new indicator columns
df.head(2)

In [None]:
# Calculate the 25th and 65th percentiles of 'current_value'.
# These are the cut-offs between the Cheap / Moderate / Expensive tiers.
p25 = df['current_value'].quantile(0.25)
p65 = df['current_value'].quantile(0.65)

def categorize_current_value(current_value, low=None, high=None):
    """Map a market value to a price tier.

    Parameters
    ----------
    current_value : number
        The value to classify.
    low : number, optional
        Upper bound (exclusive) of the 'Cheap' tier. Defaults to the
        module-level ``p25`` so existing one-argument calls, such as
        ``Series.apply(categorize_current_value)``, behave as before.
    high : number, optional
        Upper bound (exclusive) of the 'Moderate' tier. Defaults to the
        module-level ``p65``.

    Returns
    -------
    str
        'Cheap' if ``current_value < low``, 'Moderate' if
        ``low <= current_value < high``, otherwise 'Expensive'.
        A NaN input fails both comparisons and is therefore returned
        as 'Expensive'.
    """
    # Fall back to the notebook-level percentiles only when no explicit
    # threshold is supplied; this keeps the function usable in isolation.
    low = p25 if low is None else low
    high = p65 if high is None else high

    if current_value < low:
        return 'Cheap'
    elif current_value < high:
        return 'Moderate'
    else:
        return 'Expensive'

# Derive the price tier of every row and store it as 'current_value_category'
value_tiers = df['current_value'].apply(categorize_current_value)
df['current_value_category'] = value_tiers

# The raw value is no longer needed once the tier exists; remove it in place
df.drop(columns='current_value', inplace=True)

# Check how many rows ended up in each tier
print(df['current_value_category'].value_counts())

In [None]:
# Show the two cut-off values used for the tiers
p25, p65

In [None]:
# Encode the value tiers as ordered integers (Cheap < Moderate < Expensive).
# LabelEncoder assigns codes in alphabetical order (Cheap=0, Expensive=1,
# Moderate=2), which scrambles the natural ordering and makes the
# correlation-based feature selection on this column meaningless, so an
# explicit ordinal mapping is used instead.
VALUE_CATEGORY_CODES = {'Cheap': 0, 'Moderate': 1, 'Expensive': 2}

value_labels = df['current_value_category']
# Guard keeps the cell re-runnable: a numeric column has already been encoded.
if not pd.api.types.is_numeric_dtype(value_labels):
    # map() yields NaN for any label missing from the mapping, and astype(int)
    # then raises, so an unexpected label fails loudly instead of silently.
    df['current_value_category'] = value_labels.map(VALUE_CATEGORY_CODES).astype(int)

In [None]:
# Column labels after adding 'current_value_category' and dropping 'current_value'
df.columns

 **C. Feature selection**

In [None]:
# Pairwise correlations between the numeric columns of df
correlation = df.corr(numeric_only=True)

# Correlation of every column with the value tier, largest first
tier_correlation = correlation['current_value_category']
print(tier_correlation.sort_values(ascending=False))

In [None]:
# Minimum absolute correlation with the target for a column to be kept
threshold = 0.09  # adjust to make the selection stricter or looser

# Keep the columns whose |correlation| with 'current_value_category' exceeds
# the threshold; taking the absolute value lets strong negative correlations
# qualify as well as positive ones.
target_corr = correlation['current_value_category']
selected_features = target_corr[target_corr.abs() > threshold].index
selected_features

In [None]:
# Manual override: replaces the correlation-based selection with a fixed list.
# NOTE(review): `selected_features` is not referenced again in the visible cells —
# confirm whether the clustering input was meant to be restricted to it.
selected_features =['appearance', 'minutes played', 'current_value_category']

In [None]:
# Columns available for modelling
df.columns

In [None]:
# Preview the first five rows of the engineered frame
df.head()

## Prepare the Data

In [None]:
# Prepare data.
# Work on a copy: `X = df` would only alias the frame, so columns added to X
# later (e.g. the cluster labels) would also appear in df, and re-running this
# cell would then feed those labels into the scaler.
X = df.copy()


# Scale the data: standardise every column to zero mean and unit variance so
# that no single feature dominates the distances used by k-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# (rows, features) of the clustering input
X.shape

## Building the Model

In [None]:
# Elbow method: fit k-means for k = 1 .. 29 and record the within-cluster
# sum of squares (WCSS, exposed by scikit-learn as `inertia_`) for each k.
wcss = []
for n_clusters in range(1, 30):
    kmeans = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(X_scaled)
    wcss.append(kmeans.inertia_)

In [None]:
# 3. Plot the Elbow Curve: WCSS against the number of clusters.
# The x values are derived from len(wcss) rather than a second hard-coded
# range, so the two sequences always have the same length (assumes the sweep
# started at k=1, as in the loop that fills `wcss`).
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(wcss) + 1), wcss, marker='o', linestyle='-', color='b')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid(True)
plt.show()

In [None]:
# Final k-means configuration. The cluster count is chosen by reading the
# elbow curve and may need adjusting if the data changes.
optimal_clusters = 8
model = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)

## Train the Model

In [None]:
# Fit k-means on the full scaled data set (there is no train/test split here)
# and get the cluster label assigned to each row.
y_pred = model.fit_predict(X_scaled)

## Evaluating the Model

In [None]:
# Preview the (unscaled) clustering input
X.head(2)

In [None]:
# Visualise the clustering on the first two scaled columns: one point per row,
# coloured by its assigned cluster.
first_feature = X_scaled[:, 0]
second_feature = X_scaled[:, 1]
plt.scatter(first_feature, second_feature, c=y_pred, cmap='viridis', s=50)

# Overlay the fitted cluster centres (same two columns) as red crosses
centers = model.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='X', c='red', s=200, alpha=0.75)

plt.title(f'K-Means Clustering with {optimal_clusters} Clusters')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

In [None]:
# Store each row's cluster label in a new 'kmeans' column
X['kmeans'] = y_pred

In [None]:
# Number of rows assigned to each cluster
X['kmeans'].value_counts()

In [None]:
# All rows assigned to cluster 0
X[X['kmeans']==0]

In [None]:
# Columns of X, now including 'kmeans'
X.columns

In [None]:
# Minutes played vs. age, coloured by cluster label
sns.scatterplot(data = X , x = 'minutes played', y = 'age', hue = 'kmeans')

In [None]:
# Minutes played vs. award, coloured by cluster label
sns.scatterplot(data = X , x = 'minutes played', y = 'award', hue = 'kmeans')

In [None]:
import plotly
import plotly.express as px

# Interactive 3-D view of the clusters: minutes played, age and highest value
# on the three axes, points coloured by the 'kmeans' label.
fig = px.scatter_3d(
    X,
    x='minutes played',
    y='age',
    z='highest_value',
    color='kmeans',
)
fig.show()
