In [13]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


In [19]:
# DBSCAN uses a single eps radius, so it does not work well when there are too many
# dimensions with different scales; fewer principal components are loaded than before.
# NOTE(review): assumes the columns of pca.csv are ordered PC1, PC2, ... and that the
# file has no leading index column -- confirm against the notebook that wrote it.
df = (
    pd.read_csv('data/pca.csv')
    .iloc[:, :2]  # keep only the first 2 columns
)

In [None]:
# Coarse first grid of candidate eps values: 0.1 up to (but excluding) 1.0, step 0.2.
eps_values = np.arange(start=0.1, stop=1.0, step=0.2)

In [20]:
# Sweep DBSCAN over every candidate in eps_values (min_samples fixed at 5) and
# record the silhouette score for each eps that produces at least two clusters.
#
# Reads:  df (feature matrix), eps_values (iterable of eps candidates).
# Writes: eps_scores -- parallel lists of the eps values that were scored and
#         their silhouette scores; eps values with fewer than 2 clusters are
#         reported on stdout but not recorded.
eps_scores = {"eps": [], "silhouette_score": []}

# Silence library warnings only while the sweep runs. catch_warnings() restores
# the previous filters on exit, so later cells still see their own warnings
# (a bare warnings.filterwarnings("ignore") would mute the whole session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for eps in eps_values:
        dbscan = DBSCAN(eps=eps, min_samples=5)
        labels = dbscan.fit_predict(df)

        # DBSCAN marks noise points with the label -1; noise is not a cluster,
        # so it is masked out before counting the distinct cluster ids.
        mask = labels != -1
        unique_clusters = np.unique(labels[mask])

        # The silhouette score is only defined for 2 or more distinct clusters.
        if len(unique_clusters) > 1:
            # Scored on the clustered (non-noise) rows only. Consequence: an eps
            # that throws many points away as noise is NOT penalised, so compare
            # scores together with the cluster count printed below.
            score = silhouette_score(df[mask], labels[mask])

            print(f"EPS: {eps:.2f}, Clusters: {len(unique_clusters)}, Score: {score:.3f}")
            eps_scores["eps"].append(eps)
            eps_scores["silhouette_score"].append(score)
        else:
            # Debug info: show which labels came back (possibly only noise).
            print(f"EPS: {eps:.2f} -> Labels found: {set(labels)} (Not enough clusters)")

EPS: 0.10, Clusters: 647, Score: -0.434
EPS: 0.30, Clusters: 81, Score: 0.014
EPS: 0.50, Clusters: 18, Score: 0.127
EPS: 0.70, Clusters: 8, Score: -0.041
EPS: 0.90, Clusters: 2, Score: 0.590


In [23]:
# Next grid of candidate eps values: 1.0 up to (but excluding) 2.0, step 0.2.
eps_values = np.arange(start=1.0, stop=2.0, step=0.2)

In [24]:
# Sweep DBSCAN over every candidate in eps_values (min_samples fixed at 5) and
# record the silhouette score for each eps that produces at least two clusters.
#
# Reads:  df (feature matrix), eps_values (iterable of eps candidates).
# Writes: eps_scores -- parallel lists of the eps values that were scored and
#         their silhouette scores; eps values with fewer than 2 clusters are
#         reported on stdout but not recorded.
eps_scores = {"eps": [], "silhouette_score": []}

# Silence library warnings only while the sweep runs. catch_warnings() restores
# the previous filters on exit, so later cells still see their own warnings
# (a bare warnings.filterwarnings("ignore") would mute the whole session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for eps in eps_values:
        dbscan = DBSCAN(eps=eps, min_samples=5)
        labels = dbscan.fit_predict(df)

        # DBSCAN marks noise points with the label -1; noise is not a cluster,
        # so it is masked out before counting the distinct cluster ids.
        mask = labels != -1
        unique_clusters = np.unique(labels[mask])

        # The silhouette score is only defined for 2 or more distinct clusters.
        if len(unique_clusters) > 1:
            # Scored on the clustered (non-noise) rows only. Consequence: an eps
            # that throws many points away as noise is NOT penalised, so compare
            # scores together with the cluster count printed below.
            score = silhouette_score(df[mask], labels[mask])

            print(f"EPS: {eps:.2f}, Clusters: {len(unique_clusters)}, Score: {score:.3f}")
            eps_scores["eps"].append(eps)
            eps_scores["silhouette_score"].append(score)
        else:
            # Debug info: show which labels came back (possibly only noise).
            print(f"EPS: {eps:.2f} -> Labels found: {set(labels)} (Not enough clusters)")

EPS: 1.00, Clusters: 2, Score: 0.588
EPS: 1.20, Clusters: 3, Score: 0.557
EPS: 1.40, Clusters: 2, Score: 0.574
EPS: 1.60, Clusters: 2, Score: 0.574
EPS: 1.80, Clusters: 2, Score: 0.573


In [25]:
# Finer grid of candidate eps values: 0.3 up to (but excluding) 1.0, step 0.05.
eps_values = np.arange(start=0.3, stop=1.0, step=0.05)

In [26]:
# Sweep DBSCAN over every candidate in eps_values (min_samples fixed at 5) and
# record the silhouette score for each eps that produces at least two clusters.
#
# Reads:  df (feature matrix), eps_values (iterable of eps candidates).
# Writes: eps_scores -- parallel lists of the eps values that were scored and
#         their silhouette scores; eps values with fewer than 2 clusters are
#         reported on stdout but not recorded.
eps_scores = {"eps": [], "silhouette_score": []}

# Silence library warnings only while the sweep runs. catch_warnings() restores
# the previous filters on exit, so later cells still see their own warnings
# (a bare warnings.filterwarnings("ignore") would mute the whole session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for eps in eps_values:
        dbscan = DBSCAN(eps=eps, min_samples=5)
        labels = dbscan.fit_predict(df)

        # DBSCAN marks noise points with the label -1; noise is not a cluster,
        # so it is masked out before counting the distinct cluster ids.
        mask = labels != -1
        unique_clusters = np.unique(labels[mask])

        # The silhouette score is only defined for 2 or more distinct clusters.
        if len(unique_clusters) > 1:
            # Scored on the clustered (non-noise) rows only. Consequence: an eps
            # that throws many points away as noise is NOT penalised, so compare
            # scores together with the cluster count printed below.
            score = silhouette_score(df[mask], labels[mask])

            print(f"EPS: {eps:.2f}, Clusters: {len(unique_clusters)}, Score: {score:.3f}")
            eps_scores["eps"].append(eps)
            eps_scores["silhouette_score"].append(score)
        else:
            # Debug info: show which labels came back (possibly only noise).
            print(f"EPS: {eps:.2f} -> Labels found: {set(labels)} (Not enough clusters)")

EPS: 0.30, Clusters: 81, Score: 0.014
EPS: 0.35, Clusters: 55, Score: -0.241
EPS: 0.40, Clusters: 33, Score: 0.180
EPS: 0.45, Clusters: 25, Score: 0.112
EPS: 0.50, Clusters: 18, Score: 0.127
EPS: 0.55, Clusters: 15, Score: 0.255
EPS: 0.60, Clusters: 11, Score: -0.088
EPS: 0.65, Clusters: 10, Score: -0.079
EPS: 0.70, Clusters: 8, Score: -0.041
EPS: 0.75, Clusters: 6, Score: -0.040
EPS: 0.80, Clusters: 7, Score: 0.492
EPS: 0.85, Clusters: 4, Score: 0.561
EPS: 0.90, Clusters: 2, Score: 0.590
EPS: 0.95, Clusters: 2, Score: 0.590


In [27]:
# Upper grid of candidate eps values: 2.0 up to (but excluding) 3.0, step 0.2.
eps_values = np.arange(start=2.0, stop=3.0, step=0.2)

In [28]:
# Sweep DBSCAN over every candidate in eps_values (min_samples fixed at 5) and
# record the silhouette score for each eps that produces at least two clusters.
#
# Reads:  df (feature matrix), eps_values (iterable of eps candidates).
# Writes: eps_scores -- parallel lists of the eps values that were scored and
#         their silhouette scores; eps values with fewer than 2 clusters are
#         reported on stdout but not recorded.
eps_scores = {"eps": [], "silhouette_score": []}

# Silence library warnings only while the sweep runs. catch_warnings() restores
# the previous filters on exit, so later cells still see their own warnings
# (a bare warnings.filterwarnings("ignore") would mute the whole session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for eps in eps_values:
        dbscan = DBSCAN(eps=eps, min_samples=5)
        labels = dbscan.fit_predict(df)

        # DBSCAN marks noise points with the label -1; noise is not a cluster,
        # so it is masked out before counting the distinct cluster ids.
        mask = labels != -1
        unique_clusters = np.unique(labels[mask])

        # The silhouette score is only defined for 2 or more distinct clusters.
        if len(unique_clusters) > 1:
            # Scored on the clustered (non-noise) rows only. Consequence: an eps
            # that throws many points away as noise is NOT penalised, so compare
            # scores together with the cluster count printed below.
            score = silhouette_score(df[mask], labels[mask])

            print(f"EPS: {eps:.2f}, Clusters: {len(unique_clusters)}, Score: {score:.3f}")
            eps_scores["eps"].append(eps)
            eps_scores["silhouette_score"].append(score)
        else:
            # Debug info: show which labels came back (possibly only noise).
            print(f"EPS: {eps:.2f} -> Labels found: {set(labels)} (Not enough clusters)")

EPS: 2.00, Clusters: 2, Score: 0.573
EPS: 2.20, Clusters: 2, Score: 0.573
EPS: 2.40, Clusters: 2, Score: 0.571
EPS: 2.60, Clusters: 2, Score: 0.571
EPS: 2.80, Clusters: 2, Score: 0.571


In [29]:
# Repeat the search with one more principal component and a single wide eps grid.
eps_values = np.arange(start=0.1, stop=3.0, step=0.2)  # 0.1 up to (but excluding) 3.0
# NOTE(review): assumes the columns of pca.csv are ordered PC1, PC2, PC3, ... -- confirm.
df = (
    pd.read_csv('data/pca.csv')
    .iloc[:, :3]  # keep only the first 3 columns
)

In [30]:
# Sweep DBSCAN over every candidate in eps_values (min_samples fixed at 5) and
# record the silhouette score for each eps that produces at least two clusters.
#
# Reads:  df (feature matrix), eps_values (iterable of eps candidates).
# Writes: eps_scores -- parallel lists of the eps values that were scored and
#         their silhouette scores; eps values with fewer than 2 clusters are
#         reported on stdout but not recorded.
eps_scores = {"eps": [], "silhouette_score": []}

# Silence library warnings only while the sweep runs. catch_warnings() restores
# the previous filters on exit, so later cells still see their own warnings
# (a bare warnings.filterwarnings("ignore") would mute the whole session).
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for eps in eps_values:
        dbscan = DBSCAN(eps=eps, min_samples=5)
        labels = dbscan.fit_predict(df)

        # DBSCAN marks noise points with the label -1; noise is not a cluster,
        # so it is masked out before counting the distinct cluster ids.
        mask = labels != -1
        unique_clusters = np.unique(labels[mask])

        # The silhouette score is only defined for 2 or more distinct clusters.
        if len(unique_clusters) > 1:
            # Scored on the clustered (non-noise) rows only. Consequence: an eps
            # that throws many points away as noise is NOT penalised, so compare
            # scores together with the cluster count printed below.
            score = silhouette_score(df[mask], labels[mask])

            print(f"EPS: {eps:.2f}, Clusters: {len(unique_clusters)}, Score: {score:.3f}")
            eps_scores["eps"].append(eps)
            eps_scores["silhouette_score"].append(score)
        else:
            # Debug info: show which labels came back (possibly only noise).
            print(f"EPS: {eps:.2f} -> Labels found: {set(labels)} (Not enough clusters)")

EPS: 0.10, Clusters: 676, Score: 0.173
EPS: 0.30, Clusters: 280, Score: -0.493
EPS: 0.50, Clusters: 149, Score: -0.138
EPS: 0.70, Clusters: 97, Score: 0.213
EPS: 0.90, Clusters: 44, Score: -0.351
EPS: 1.10, Clusters: 17, Score: -0.063
EPS: 1.30, Clusters: 14, Score: -0.210
EPS: 1.50, Clusters: 5, Score: 0.127
EPS: 1.70, Clusters: 4, Score: 0.312
EPS: 1.90, Clusters: 3, Score: 0.323
EPS: 2.10, Clusters: 4, Score: 0.313
EPS: 2.30, Clusters: 4, Score: 0.312
EPS: 2.50, Clusters: 4, Score: 0.312
EPS: 2.70, Clusters: 3, Score: 0.418
EPS: 2.90, Clusters: 3, Score: 0.417


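**With 2 principal components**
Best eps: 0.90 (eps = 0.95 gives the same score),
because it has the highest silhouette score: 0.590, with 2 clusters. Note that the score is computed on non-noise points only.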

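**With 3 principal components**
Best eps: 2.70,
because it has the highest silhouette score: 0.418, with 3 clusters. Note that the score is computed on non-noise points only.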