In [None]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics

%matplotlib inline

## Loading the data
We can now load the dataset into pandas using the read_csv() function. This converts the CSV file into a Pandas dataframe.

In [None]:
#Read in the csv file and convert to a Pandas dataframe
df_player=pd.read_csv(r"C:\Users\Eknjz\Desktop\DATA SCIENCE AND MACHINE LEARNING\DAY21\Usecase-7\final_data.csv")

### Viewing the dataframe
We can get a quick sense of the size of our dataset by using the shape method. This returns a tuple with the number of rows and columns in the dataset.

In [None]:
df_player.head()

In [None]:
df_player.shape # the dataframe shape

## 1. Data Profiling:
Data profiling is a comprehensive process of examining the data available in an existing dataset and collecting statistics and information about that data. 

In [42]:
df_player.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10754 entries, 0 to 10753
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   team              10754 non-null  object 
 1   name              10754 non-null  object 
 2   age               10754 non-null  float64
 3   appearance        10754 non-null  int64  
 4   goals             10754 non-null  float64
 5   assists           10754 non-null  float64
 6   red cards         10754 non-null  float64
 7   goals conceded    10754 non-null  float64
 8   clean sheets      10754 non-null  float64
 9   minutes played    10754 non-null  int64  
 10  days_injured      10754 non-null  int64  
 11  games_injured     10754 non-null  int64  
 12  award             10754 non-null  int64  
 13  current_value     10754 non-null  int64  
 14  highest_value     10754 non-null  int64  
 15  position_encoded  10754 non-null  int64  
dtypes: float64(6), int64(8), object(2)
memor

In [43]:
df_player.describe(include='object')

Unnamed: 0,team,name
count,10754,10754
unique,374,10584
top,Daejeon Hana Citizen,Paulinho
freq,46,6


In [44]:
df_player.columns 

Index(['team', 'name', 'age', 'appearance', 'goals', 'assists', 'red cards',
       'goals conceded', 'clean sheets', 'minutes played', 'days_injured',
       'games_injured', 'award', 'current_value', 'highest_value',
       'position_encoded'],
      dtype='object')

### Data Quality Checks
Data quality checks involve the process of ensuring that the data is accurate, complete, consistent, relevant, and reliable. 


**Here are typical steps involved in checking data quality:**

#### 1. Reliability:
Evaluate the data's source and collection process to determine its trustworthiness.

In [None]:
# in the kaggle page mentioned data source

#### 2. Timeliness: 
Ensure the data is up-to-date and reflective of the current situation or the period of interest for the analysis.

#### 3. Consistency: 

Confirm that the data is consistent within the dataset and across multiple data sources. For example, the same data point should not have different values in different places.


In [None]:
# check one of the files and the big file 

#### 4. Relevance: 
Assess whether the data is appropriate and applicable for the intended analysis. Data that is not relevant can skew results and lead to incorrect conclusions.

**Key considerations for relevance include:**

> 1. Sample Appropriateness: Confirm that your data sample aligns with your analysis objectives. For instance, utilizing data from the Northern region will not yield accurate insights for the Western region of the Kingdom.
>
> 2. Variable Selection: Any column will not be relevant for our analysis, we can get rid of these using the drop() method. We will set the “axis” argument to 1 since we’re dealing with columns, and set the “inplace” argument to True to make the change permanent.


In [None]:
df_player.shape

#### 5. Uniqueness: 
Check for and remove duplicate records to prevent skewed analysis results.


In [45]:
df_player.head()

Unnamed: 0,team,name,age,appearance,goals,assists,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded
0,Manchester United,David de Gea,32.0,104,0.0,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000,1
1,Manchester United,Jack Butland,30.0,15,0.0,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000,1
2,Manchester United,Tom Heaton,37.0,4,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000,1
3,Manchester United,Lisandro Martínez,25.0,82,0.02809,0.05618,0.0,0.0,0.0,6408,175,22,9,50000000,50000000,2
4,Manchester United,Raphaël Varane,30.0,63,0.017889,0.017889,0.0,0.0,0.0,5031,238,51,21,40000000,80000000,2


In [46]:
df_player.duplicated().sum() # -> 0 duplicated rows

0

In [47]:
df_player.drop_duplicates(inplace=True)
df_player[df_player.duplicated()]  # ->NO duplicated rows

Unnamed: 0,team,name,age,appearance,goals,assists,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded


In [48]:
 df_player.shape # the same shape of dataset

(10754, 16)

#### 6. Completeness: 
Ensure that no critical data is missing. This might mean checking for null values or required fields that are empty.

We will start by checking the dataset for missing or null values. For this, we can use the isna() method which returns a dataframe of boolean values indicating if a field is null or not. To group all missing values by column, we can include the sum() method.

In [49]:
#Display number missing values per column
df_player.isnull().sum().sort_values(ascending = False)

team                0
name                0
age                 0
appearance          0
goals               0
assists             0
red cards           0
goals conceded      0
clean sheets        0
minutes played      0
days_injured        0
games_injured       0
award               0
current_value       0
highest_value       0
position_encoded    0
dtype: int64

#### 7. Check Accuracy:

Verify that the data is correct and precise. This could involve comparing data samples with known sources or using validation rules.

**The process includes:**
1. Validating the appropriateness of data types for the dataset.
2. Identifying outliers  using established validation  rule

In [50]:
# check columns types 
df_player.dtypes

team                 object
name                 object
age                 float64
appearance            int64
goals               float64
assists             float64
red cards           float64
goals conceded      float64
clean sheets        float64
minutes played        int64
days_injured          int64
games_injured         int64
award                 int64
current_value         int64
highest_value         int64
position_encoded      int64
dtype: object

In [51]:
 df_player.head()

Unnamed: 0,team,name,age,appearance,goals,assists,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded
0,Manchester United,David de Gea,32.0,104,0.0,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000,1
1,Manchester United,Jack Butland,30.0,15,0.0,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000,1
2,Manchester United,Tom Heaton,37.0,4,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,6000000,1
3,Manchester United,Lisandro Martínez,25.0,82,0.02809,0.05618,0.0,0.0,0.0,6408,175,22,9,50000000,50000000,2
4,Manchester United,Raphaël Varane,30.0,63,0.017889,0.017889,0.0,0.0,0.0,5031,238,51,21,40000000,80000000,2


In [52]:
df_player.columns

Index(['team', 'name', 'age', 'appearance', 'goals', 'assists', 'red cards',
       'goals conceded', 'clean sheets', 'minutes played', 'days_injured',
       'games_injured', 'award', 'current_value', 'highest_value',
       'position_encoded'],
      dtype='object')

In [53]:
#Delete all un used columns
#Delete unrelevent columns
df_player.drop(['player','height','yellow cards','second yellow cards','position','winger'],axis=1,inplace=True)

KeyError: "['player', 'height', 'yellow cards', 'second yellow cards', 'position', 'winger'] not found in axis"

In [54]:
# check changes
df_player.head(2)

Unnamed: 0,team,name,age,appearance,goals,assists,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded
0,Manchester United,David de Gea,32.0,104,0.0,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,70000000,1
1,Manchester United,Jack Butland,30.0,15,0.0,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,22000000,1


In [55]:
df_player.shape

(10754, 16)

In [None]:
profile = ProfileReport(df_player, title="Data Profiling Report", explorative=True)
profile.to_notebook_iframe()

**What is an Outlier?** 
Outlier is an row/observation that appears far away and diverges from an overall pattern in a sample.

**What are the types of Outliers?**
1. Univariate: These outliers can be found when we look at distribution of a single variable
2. Multivariate: are outliers in an n-dimensional space. In order to find them, you have to look at distributions in multi-dimensions. example (hight=100, weight=100) for a person

**What causes Outliers?**
Whenever we come across outliers, the ideal way to tackle them is to find out the reason of having these outliers. The method to deal with them would then depend on the reason of their occurrence.

Let’s understand various types of outliers:

1. Data Entry Errors:- Human errors such as errors caused during data collection, recording, or entry can cause outliers in data.
2. Measurement Error: It is the most common source of outliers. This is caused when the measurement instrument used turns out to be faulty.
3. Data Processing Error: Whenever we perform data mining, we extract data from multiple sources. It is possible that some manipulation or extraction errors may lead to outliers in the dataset.
4. Sampling error: For instance, we have to measure the height of athletes. By mistake, we include a few basketball players in the sample. This inclusion is likely to cause outliers in the dataset.
5. Natural Outlier: When an outlier is not artificial (due to error), it is a natural outlier. For instance: In my last assignment with one of the renowned insurance company, I noticed that the performance of top 50 financial advisors was far higher than rest of the population. Surprisingly, it was not due to any error. Hence, whenever we perform any data mining activity with advisors, we used to treat this segment separately.


**What is the impact of Outliers on a dataset?**


![image.png](https://www.analyticsvidhya.com/wp-content/uploads/2015/02/Outlier_31.png)


[![](https://markdown-videos-api.jorgenkh.no/youtube/aUKfwkHfgiA)](https://www.youtube.com/watch?v=aUKfwkHfgiA)

**How to detect Outliers?**

1. Most commonly used method to detect outliers is visualization (Univariate Graphical Analysis).

We use 3 common visualization methods:
>- Box-plot: A box plot is a method for graphically depicting groups of numerical data through their quartiles. The box extends from the Q1 to Q3 quartile values of the data, with a line at the median (Q2). The whiskers extend from the edges of the box to show the range of the data. Outlier points are those past the end of the whiskers. Box plots show robust measures of location and spread as well as providing information about symmetry and outliers.
>
>  
>![image.png](https://miro.medium.com/v2/resize:fit:698/format:webp/1*VK5iHA2AB28HSZwWwUbNYg.png)
>
>
>- Histogram
>- Scatter Plot: A scatter plot is a mathematical diagram using Cartesian coordinates to display values for two variables for a set of data. The data are displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis. The points that are far from the population can be termed as an outlier.
>
>  
>![image.png](https://miro.medium.com/v2/resize:fit:4800/format:webp/1*Ov6aH-8yIwNoUxtMFwgx4g.png)
>
>

2. Using statistical method (Univariate Non-Graphical analysis):
>- Any value, which is beyond the range of -1.5 x IQR to 1.5 x IQR
 
![image.png](https://www.whatissixsigma.net/wp-content/uploads/2015/07/Box-Plot-Diagram-to-identify-Outliers-figure-1.png)

>- Use capping methods. Any value which out of range of 5th and 95th percentile can be considered as outlier
>- Data points, three or more standard deviation away from mean are considered outlier: The Z-score is the signed number of standard deviations by which the value of an observation or data point is above the mean value of what is being observed or measured. While calculating the Z-score we re-scale and center the data and look for data points that are too far from zero. These data points which are way too far from zero will be treated as the outliers. In most of the cases, a threshold of 3 or -3 is used i.e if the Z-score value is greater than or less than 3 or -3 respectively, that data point will be identified as outliers.
> - Outlier detection is merely a special case of the examination of data for influential data points and it also depends on the business understanding


In [None]:
plt.figure(figsize=(15,15))
sns.pairplot(df_player)
plt.show()

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player['appearance'], bins=100)
plt.show()


In [None]:
# find the outlier in appearance columns
plt.figure(figsize=(15,8))

plt.hist(df_player[(df_player['appearance']<=120) & (df_player['appearance']>5)]['appearance'], bins=100)
plt.show()


In [None]:
# delete outlier 
df_player=df_player[(df_player['appearance']<=120) & (df_player['appearance']>5)]

In [None]:
df_player.shape

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player['goals'], bins=100)
plt.show()

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player['assists'], bins=100)
plt.show()

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player[(df_player['assists']<=0.4) & (df_player['assists']>0.0)]['assists'], bins=100)
plt.show()


In [None]:
df_player=df_player[(df_player['assists']<=0.4) & (df_player['assists']>0.0)]

In [None]:
df_player.shape

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player['minutes played'], bins=100)
plt.show()                                            

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player[df_player['minutes played']<9000]['minutes played'], bins=50)
plt.show()


In [None]:
df_player=df_player[df_player['minutes played']<9000]
df_player.shape

In [None]:
plt.figure(figsize=(15,8))
plt.hist(df_player['award'], bins=30)
plt.show()
                          

In [None]:

plt.figure(figsize=(15,8))

plt.hist(df_player[(df_player['award']<15) & (df_player['award']>1)]['award'], bins=30)
plt.show()


In [None]:
df_player=df_player[(df_player['award']<15) & (df_player['award']>1)]

In [None]:
df_player.shape

In [None]:
plt.figure(figsize=(15,8))
plt.hist(df_player['highest_value'], bins=30)
plt.show()
                                           

In [None]:
plt.figure(figsize=(15,8))

plt.hist(df_player[(df_player['highest_value']<0.9) & (df_player['highest_value']>0.1)]['highest_value'], bins=100) ## i can not delete more 
plt.show()


In [None]:
df_player.shape

In [None]:
df_player.columns

In [56]:

df_player.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10754 entries, 0 to 10753
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   team              10754 non-null  object 
 1   name              10754 non-null  object 
 2   age               10754 non-null  float64
 3   appearance        10754 non-null  int64  
 4   goals             10754 non-null  float64
 5   assists           10754 non-null  float64
 6   red cards         10754 non-null  float64
 7   goals conceded    10754 non-null  float64
 8   clean sheets      10754 non-null  float64
 9   minutes played    10754 non-null  int64  
 10  days_injured      10754 non-null  int64  
 11  games_injured     10754 non-null  int64  
 12  award             10754 non-null  int64  
 13  current_value     10754 non-null  int64  
 14  highest_value     10754 non-null  int64  
 15  position_encoded  10754 non-null  int64  
dtypes: float64(6), int64(8), object(2)
memor

In [57]:
categorical_features = ['team', 'name']
numeric_features = ['age ', 'appearance', 'goals','assists','red cards', 'goals conceded', 'clean sheets','minutes played  ', 'days_injured', 'games_injured',' award','current_value', 'highest_value', 'position_encoded']

In [58]:
# Calculate the 66 percentiles of the CURRENT_VALUE
p66 = df_player['current_value'].quantile(0.66)


# Function to categorize prices
def categorize_value(current_value):
    if current_value <=p66:
        return 'Good_Price'
        
    else:
        return 'High_Price'




In [59]:
# Apply the function to create a new column
df_player['player_price_category'] = df_player['current_value'].apply(categorize_value)
df_player.drop('current_value', axis=1, inplace=True)
# Verify the distribution of the new categories
print(df_player['player_price_category'].value_counts())

player_price_category
Good_Price    7218
High_Price    3536
Name: count, dtype: int64


In [60]:
p66

1800000.0

In [61]:
df_player['player_price_category']

0        High_Price
1        Good_Price
2        Good_Price
3        High_Price
4        High_Price
            ...    
10749    Good_Price
10750    Good_Price
10751    Good_Price
10752    Good_Price
10753    Good_Price
Name: player_price_category, Length: 10754, dtype: object

In [62]:
encoder = LabelEncoder()
df_player['player_price_category_encoded'] = encoder.fit_transform(df_player['player_price_category'])  

In [63]:
correlation = df_player.corr(numeric_only=True)
#print(correlation['Price'].sort_values(ascending=False))
print(correlation['player_price_category_encoded'].sort_values(ascending=False))

player_price_category_encoded    1.000000
highest_value                    0.489325
appearance                       0.478693
minutes played                   0.453392
award                            0.225064
games_injured                    0.192834
days_injured                     0.161281
assists                          0.141243
goals                            0.107091
position_encoded                 0.100955
red cards                       -0.011041
clean sheets                    -0.020957
age                             -0.034189
goals conceded                  -0.099790
Name: player_price_category_encoded, dtype: float64


In [64]:
threshold = 0.36  # You can change this value based on your requirement

# Filter the correlations
# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['player_price_category_encoded']) > \
threshold]['player_price_category_encoded'].index
selected_features

Index(['appearance', 'minutes played', 'highest_value',
       'player_price_category_encoded'],
      dtype='object')

In [None]:
#correlation = df_player.corr()
#correlation

In [65]:

print(correlation['player_price_category_encoded'].sort_values(ascending=False))

player_price_category_encoded    1.000000
highest_value                    0.489325
appearance                       0.478693
minutes played                   0.453392
award                            0.225064
games_injured                    0.192834
days_injured                     0.161281
assists                          0.141243
goals                            0.107091
position_encoded                 0.100955
red cards                       -0.011041
clean sheets                    -0.020957
age                             -0.034189
goals conceded                  -0.099790
Name: player_price_category_encoded, dtype: float64


In [66]:
# Set the correlation threshold
threshold = 0.27  # You can change this value based on your requirement

# Filter the correlations
# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['player_price_category_encoded']) > threshold]['player_price_category_encoded'].index
selected_features

Index(['appearance', 'minutes played', 'highest_value',
       'player_price_category_encoded'],
      dtype='object')

In [67]:
selected_features=['appearance', 'minutes played', 'award',
       'highest_value', 'player_price_category_encoded']
selected_features

['appearance',
 'minutes played',
 'award',
 'highest_value',
 'player_price_category_encoded']

In [68]:
df = df_player[selected_features]
df.head()

Unnamed: 0,appearance,minutes played,award,highest_value,player_price_category_encoded
0,104,9390,13,70000000,1
1,15,1304,1,22000000,0
2,4,292,4,6000000,0
3,82,6408,9,50000000,1
4,63,5031,21,80000000,1


In [69]:
# Prepare data
X = df.drop(['player_price_category_encoded'], axis=1)
y = df['player_price_category_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=42)

# sacle the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_test_scaled.shape

(2151, 4)

In [70]:
X.shape

(10754, 4)

In [None]:
model = KNeighborsClassifier()

In [None]:
param_grid = {
    'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]
}
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_macro', 
                           verbose=1)

In [None]:
# Fit the model on the training data
grid_search.fit(X_train_scaled, y_train)

In [None]:
grid_search.best_params_

In [None]:
model = grid_search.best_estimator_

In [None]:
# Predict and evaluate the model
y_pred = model.predict(X_test_scaled)

In [None]:

base_model = round(df['player_price_category_encoded'].value_counts()[1]/df.shape[0]*100, 2)
base_model

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
y_pred_train = model.predict(X_train_scaled)
accuracy = accuracy_score(y_train, y_pred_train)
accuracy

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

In [None]:
print("Train classification report for best knn:")
print(f' {classification_report(y_train, y_pred_train, labels=list(model.classes_))}')
print("Test classification report for best knn: ")
print(f'{classification_report(y_test, y_pred, labels=list(model.classes_))}')

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(y_test, y_pred,
                                        display_labels=model.classes_,
                                        cmap="Blues",
                                        xticks_rotation='vertical')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Calculate Precision
precision = precision_score(y_test, y_pred, average='macro')
print(f"Precision: {precision:.2f}")

In [None]:
# Calculate Recall
recall = recall_score(y_test, y_pred, average='macro')
print(f"Recall: {recall:.2f}")

In [None]:
# Calculate F1 Score
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1:.2f}")