In [2]:
# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import dtale
import math
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.metrics import silhouette_score

%matplotlib inline

In [3]:
df = pd.read_csv('final_data.csv')

In [4]:
df.shape

(10754, 22)

In [5]:
df.head(2)

Unnamed: 0,player,team,name,position,height,age,appearance,goals,assists,yellow cards,...,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,highest_value,position_encoded,winger
0,/david-de-gea/profil/spieler/59377,Manchester United,David de Gea,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,...,1.217252,0.335463,9390,42,5,13,15000000,70000000,1,0
1,/jack-butland/profil/spieler/128899,Manchester United,Jack Butland,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,...,1.242331,0.207055,1304,510,58,1,1500000,22000000,1,0


In [6]:
# uniqeness
df.duplicated().sum()

0

In [7]:
# Completeness
df.isnull().sum()

player                 0
team                   0
name                   0
position               0
height                 0
age                    0
appearance             0
goals                  0
assists                0
yellow cards           0
second yellow cards    0
red cards              0
goals conceded         0
clean sheets           0
minutes played         0
days_injured           0
games_injured          0
award                  0
current_value          0
highest_value          0
position_encoded       0
winger                 0
dtype: int64

In [8]:
# Accurecy types
df.dtypes

player                  object
team                    object
name                    object
position                object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
position_encoded         int64
winger                   int64
dtype: object

In [9]:
# drop column player we dont need it 
# drop column position becuse we have position_encoded
df.drop('player',axis=1,inplace = True) 
df.drop('position',axis=1,inplace = True)
df.drop('name',axis=1,inplace= True) 
df.dtypes

team                    object
height                 float64
age                    float64
appearance               int64
goals                  float64
assists                float64
yellow cards           float64
second yellow cards    float64
red cards              float64
goals conceded         float64
clean sheets           float64
minutes played           int64
days_injured             int64
games_injured            int64
award                    int64
current_value            int64
highest_value            int64
position_encoded         int64
winger                   int64
dtype: object

In [10]:
# Accurecy - Outlier

# Convert categorical variables using OneHotEncoding
# position is alrady encoded

categorical_features = ['team']
numeric_features = ['height', 'age', 'appearance','goals', 'assists', 'yellow cards', 'second yellow cards', 'red cards',
       'goals conceded', 'clean sheets', 'minutes played', 'days_injured',
       'games_injured', 'award', 'current_value', 'highest_value',
       'position_encoded', 'winger']
       

In [11]:
for i in categorical_features:
    print (df[i].value_counts())

team
Daejeon Hana Citizen      46
Jeonbuk Hyundai Motors    46
FC Seoul                  45
Gangwon FC                43
Daegu FC                  42
                          ..
FC Barcelona              22
Atlético de Madrid        22
CA Osasuna                22
Hatayspor                 12
Gaziantep FK               6
Name: count, Length: 374, dtype: int64


In [12]:
for i in numeric_features:
    print(df[i].value_counts())

height
180.000000    744
185.000000    608
175.000000    596
178.000000    594
183.000000    572
188.000000    493
184.000000    458
182.000000    449
181.240353    440
186.000000    423
187.000000    419
176.000000    390
177.000000    372
190.000000    371
173.000000    366
179.000000    359
181.000000    355
170.000000    312
191.000000    272
174.000000    270
189.000000    255
172.000000    254
192.000000    203
193.000000    181
171.000000    169
168.000000    124
194.000000    117
169.000000    107
195.000000     98
167.000000     85
196.000000     68
166.000000     44
165.000000     42
197.000000     31
198.000000     24
163.000000     16
160.000000     15
164.000000     14
199.000000     12
200.000000     12
202.000000      6
162.000000      6
161.000000      3
206.000000      2
204.000000      1
156.000000      1
159.000000      1
Name: count, dtype: int64
age
22.000000    841
23.000000    823
26.000000    812
25.000000    772
24.000000    767
21.000000    715
28.000000    67

In [13]:
df = df[df['age']<40]
df = df[df["appearance"]<100]
df = df[df["goals"]>0.5]
df = df[df['award']<5]
df = df[df["days_injured"]<407]
df = df[df['games_injured']<60]
df = df[df["minutes played"]<8000]

In [14]:
df = df[['team','goals','appearance','assists','current_value']]

In [15]:
df = pd.get_dummies(df, columns=['team'])

In [16]:
X = df


# sacle the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [17]:
# 2. Use NearestNeighbors to find the optimal eps
min_samples = 2

In [18]:
nn = NearestNeighbors(n_neighbors=min_samples)
nn.fit(X_scaled)
distances, indices = nn.kneighbors(X_scaled)

In [19]:
distances[:,min_samples-1]

array([20.90080801, 20.58742245,  1.0065514 ,  1.0065514 ,  2.54965454,
        2.54965454, 20.90009455, 20.47157509,  0.72432747,  0.72432747,
       20.6642298 , 20.58794892,  0.55804796,  0.55804796,  1.0329952 ,
        1.27785777,  1.0329952 , 20.45938265,  0.71826195,  0.71826195,
        0.77394693, 20.45566459, 20.55984923, 20.48404133, 20.5394752 ,
        2.21391637,  2.21391637,  4.03812887,  4.03812887,  2.75700702,
        2.75700702, 20.88556157, 20.59226213, 20.47316249, 20.47062847,
       20.75792771, 20.70688452,  1.10374227,  1.10374227,  1.16783231,
        1.16783231, 20.52489463, 20.59352816, 21.28470535, 20.52014945,
       20.87022326, 21.2208016 , 20.85810541,  1.54535617,  3.56817442,
        1.54535617, 20.63202957, 20.89835229, 20.53509201,  1.05603404,
        1.29358415,  1.05603404, 20.45193263,  0.47076718,  0.47076718,
       20.44664433,  0.70218944,  8.06646711,  0.70218944, 20.92665003,
        3.31998253,  3.31998253,  1.32913089,  1.32913089,  1.99

In [34]:
k_dist = distances[:, min_samples-1]

# Sort distances
k_dist_sorted = np.sort(k_dist)

# Create a color scale based on the sorted distances
colors = px.colors.sequential.Viridis

# Create a scatter plot with a color gradient
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=np.arange(len(k_dist_sorted)),
    y=k_dist_sorted,
    mode='markers+lines',
    marker=dict(
        color=k_dist_sorted,
        colorscale=colors,
        showscale=True
    )
))

# Update layout for better visualization
fig.update_layout(
    title='K-Distance Graph',
    xaxis_title='Points sorted by distance',
    yaxis_title='k-distance (eps value)',
    template='plotly_white'
)

# Show the plot
fig.show()

In [21]:
# 4. Apply DBSCAN with chosen eps and min_samples
# Typically, you choose eps at the "elbow" point in this plot, where the slope changes significantly
chosen_eps = 0.4  # This is an example; choose this value based on the plot
dbscan = DBSCAN(eps=chosen_eps, min_samples=min_samples)

In [22]:
clusters = dbscan.fit_predict(X_scaled)

In [23]:
pd.Series(clusters).value_counts()

-1    348
 0      2
 1      2
 2      2
 3      2
 4      2
 5      2
 6      2
 7      2
Name: count, dtype: int64

In [24]:
silhouette_avg = silhouette_score(X, clusters)
silhouette_avg

-0.7047421608059443

In [25]:
X.head(2)

Unnamed: 0,goals,appearance,assists,current_value,team_1.FC Köln,team_1.FC Union Berlin,team_1.FSV Mainz 05,team_AA Argentinos Juniors,team_AC Ajaccio,team_ACF Fiorentina,...,team_Vitesse Arnhem,team_WSG Tirol,team_Watford FC,team_West Bromwich Albion,team_Western Sydney Wanderers,team_Western United FC,team_Wolfsberger AC,team_Yokohama F. Marinos,team_Zenit St. Petersburg,team_Ümraniyespor
26,0.52573,72,0.300417,25000000,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
161,0.542714,58,0.226131,12000000,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [26]:
X.columns

Index(['goals', 'appearance', 'assists', 'current_value', 'team_1.FC Köln',
       'team_1.FC Union Berlin', 'team_1.FSV Mainz 05',
       'team_AA Argentinos Juniors', 'team_AC Ajaccio', 'team_ACF Fiorentina',
       ...
       'team_Vitesse Arnhem', 'team_WSG Tirol', 'team_Watford FC',
       'team_West Bromwich Albion', 'team_Western Sydney Wanderers',
       'team_Western United FC', 'team_Wolfsberger AC',
       'team_Yokohama F. Marinos', 'team_Zenit St. Petersburg',
       'team_Ümraniyespor'],
      dtype='object', length=236)

In [27]:
fig = px.scatter(
    x=X_scaled[:, 0],
    y=X_scaled[:, 1],
    color=clusters,
    color_continuous_scale='Viridis',
    title=f'DBSCAN Clustering with eps={chosen_eps}',
    labels={'x': 'Feature 1', 'y': 'Feature 2', 'color': 'Cluster Label'}
)

# Update marker size and edge color
fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey')))

# Update layout for better visualization
fig.update_layout(
    xaxis_title='Feature 1',
    yaxis_title='Feature 2',
    coloraxis_colorbar=dict(title='Cluster Label'),
    template='plotly_white'
)

# Show the plot
fig.show()

In [28]:
X['dbscan'] = clusters

In [29]:
X['dbscan'].value_counts()

dbscan
-1    348
 0      2
 1      2
 2      2
 3      2
 4      2
 5      2
 6      2
 7      2
Name: count, dtype: int64

In [30]:
fig = px.scatter(
    data_frame=X,
    x='goals',
    y='team_Yokohama F. Marinos',
    color='dbscan',
    color_continuous_scale='Viridis',
    title='DBSCAN Clustering: Goals vs. Team Yokohama F. Marinos',
    labels={'goals': 'Goals', 'team_Yokohama F. Marinos': 'Team Yokohama F. Marinos', 'color': 'Cluster Label'}
)

# Update layout for better visualization
fig.update_layout(
    xaxis_title='Goals',
    yaxis_title='Team Yokohama F. Marinos',
    coloraxis_colorbar=dict(title='Cluster Label'),
    template='plotly_white'
)

# Show the plot
fig.show()


In [31]:
fig = px.scatter(
    data_frame=X,
    x='goals',
    y='assists',
    color='dbscan',
    color_continuous_scale='Viridis',
    title='DBSCAN Clustering: Goals vs. Assists',
    labels={'goals': 'Goals', 'assists': 'Assists', 'color': 'Cluster Label'}
)

# Update layout for better visualization
fig.update_layout(
    xaxis_title='Goals',
    yaxis_title='Assists',
    coloraxis_colorbar=dict(title='Cluster Label'),
    template='plotly_white'
)

# Show the plot
fig.show()


In [35]:
import joblib
joblib.dump(dbscan , 'Model_7.joblib')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']