## Density-Based Cluster Validity (Dcv)
    i. It evaluates clusters produced by density-based clustering algorithms such as DBSCAN and HDBSCAN.
    ii. The fundamental idea of Dcv is to compare the density inside the clusters with the density outside the clusters (here, the overall dataset).


In [1]:
import hdbscan
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

In [8]:
# Function to calculate intra-cluster density
def intra_cluster_density(X, labels):
    """Average, over clusters, of each cluster's mean pairwise Euclidean distance.

    Despite the name, the returned value is a distance: smaller values mean
    the points inside each cluster lie closer together.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points, one per row.
    labels : array-like of shape (n_samples,)
        Cluster label of each row of ``X``. The label ``-1`` marks noise and
        is skipped.

    Returns
    -------
    float
        Mean of the per-cluster mean pairwise distances, or ``nan`` when
        ``labels`` contains no cluster other than noise.
    """
    # A plain Python list compared with a scalar yields a single bool, not an
    # element-wise mask, so coerce the labels (and list/tuple data) to arrays.
    labels = np.asarray(labels)
    if isinstance(X, (list, tuple)):
        X = np.asarray(X)

    cluster_means = []
    # np.unique gives the distinct labels in sorted order.
    for label in np.unique(labels):
        if label == -1:  # Ignore noise points
            continue
        # Rows of X whose label equals the current cluster label.
        cluster_points = X[labels == label]
        # The mean is taken over the full distance matrix, so the zero
        # self-distances on the diagonal are included (a single-point cluster
        # therefore contributes 0).
        cluster_means.append(np.mean(euclidean_distances(cluster_points)))

    if not cluster_means:
        # Every point is noise: return NaN explicitly instead of letting
        # np.mean([]) emit a "Mean of empty slice" RuntimeWarning.
        return float("nan")
    return np.mean(cluster_means)


In [9]:
# Function to calculate overall dataset density
def overall_density(X):
    """Mean pairwise Euclidean distance between all rows of ``X``.

    The average runs over the full distance matrix, so the zero
    self-distances on its diagonal are part of the mean.
    """
    return euclidean_distances(X).mean()


In [11]:
# Dcv index calculation
def density_based_cluster_validity(X, labels):
    """Ratio of the intra-cluster value to the overall-dataset value.

    Computes ``intra_cluster_density(X, labels) / overall_density(X)``. Both
    terms are mean pairwise distances, so a result below 1 means points
    sharing a cluster are closer together on average than points in the
    dataset as a whole.
    """
    return intra_cluster_density(X, labels) / overall_density(X)


In [16]:
# Sample Data

# random_state is fixed so that Restart & Run All regenerates the same points;
# without it every run draws a different sample and the printed Dcv values
# cannot be reproduced.
X_raw, _ = make_moons(n_samples=300, noise=0.05, random_state=42)

# Scaling the data
# The raw sample is kept under its own name; downstream cells use the scaled X.
X = StandardScaler().fit_transform(X_raw)

In [17]:
# Apply DBSCAN clustering
# Fit the estimator on X, then read the per-point cluster labels from it.
dbscan = DBSCAN(min_samples=5, eps=0.2)
dbscan_labels = dbscan.fit(X).labels_


In [18]:
# Calculate Dcv for DBSCAN clusters
# The value is the mean intra-cluster pairwise distance divided by the mean
# pairwise distance over the whole dataset, so a result below 1 means points
# sharing a cluster are closer together on average than points overall.
dcv_dbscan = density_based_cluster_validity(X, dbscan_labels)
print(f"Dcv for DBSCAN: {dcv_dbscan}")


Dcv for DBSCAN: 0.711837958752822


In [19]:
# Apply HDBSCAN clustering (optional)
# Fit the clusterer on the same X used for DBSCAN, then read its labels.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
hdbscan_labels = hdbscan_clusterer.fit(X).labels_


In [20]:
# Calculate Dcv for HDBSCAN clusters
# Same ratio as for DBSCAN: mean intra-cluster pairwise distance divided by the
# mean pairwise distance over the whole dataset.
dcv_hdbscan = density_based_cluster_validity(X, hdbscan_labels)
print(f"Dcv for HDBSCAN: {dcv_hdbscan}")

Dcv for HDBSCAN: 0.7168203469301986
