# Checking the efficiency of different clustering algorithms using the Silhouette score by taking differnt amount of same dataset.

# Installing the requirements

In [2]:
pip install hdbscan

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Import the libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans #for k-means
from sklearn.cluster import AgglomerativeClustering #for Hierarchical clustering
from sklearn.cluster import DBSCAN #for DBSCAN clustering
from sklearn.cluster import SpectralClustering #for Spectral clustering
from sklearn.cluster import MeanShift #for Mean-shift clustering
from sklearn.cluster import AffinityPropagation #for Affinity propagation clustering
from sklearn.cluster import OPTICS #OPTICS (Ordering Points To Identify the Clustering Structure)
import hdbscan #for HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)
from sklearn.mixture import GaussianMixture #for Gaussian Metrix Model

from sklearn.metrics import silhouette_score
import time

import matplotlib.pyplot as plt

# Import the dataset (dataset of 0 to 10000 entries)

In [2]:
dataset = pd.read_csv("cleaned_dataset_1000.csv")

In [3]:
dataset.head(30)

Unnamed: 0,time,latitude,longitude,depth,magnitude,magnitude type,location
0,2009-03-10 01:15:27 IST,30.1,83.54,33,3.1,ML,"316km NNW of Kathmandu, Nepal"
1,2009-03-10 04:58:45 IST,35.12,73.75,10,2.9,ML,"132km NNW of Gulmarg, Jammu and Kashmir, India"
2,2009-03-10 23:13:17 IST,23.4,69.5,10,3.1,ML,"139km NNE of Dwarka, Gujarat, India"
3,2009-03-11 08:19:48 IST,33.35,76.5,10,2.9,ML,"115km SSW of Alchi(Leh),Jammu & Kashmir,India"
4,2009-03-11 18:28:43 IST,10.93,91.89,10,5.1,MB,"119km SW of Portblair, Andaman and Nicobar isl..."
5,2009-03-11 22:08:44 IST,17.29,73.56,10,2.5,ML,"97km NW of Kolhapur, Maharashtra, India"
6,2009-03-12 21:51:36 IST,32.49,76.36,10,3.6,ML,"30km N of Dharamshala, Himachal Pradesh, India"
7,2009-03-13 06:04:18 IST,36.85,76.63,10,3.4,ML,"259km N of Kargil, Laddakh, India"
8,2009-03-13 16:49:56 IST,36.2,69.4,10,3.8,ML,"143km SW of Fayzabad, Afghanistan"
9,2009-03-14 02:12:12 IST,36.6,71.1,197,4.5,ML,"74km SE of Fayzabad, Afghanistan"


# Extract the features and define the range of 'k'

In [4]:
# Extract the features you want to cluster on
X = dataset[['depth', 'magnitude']]

# Define the range of k values you want to test for k-means clustering
k_values = range(2, 11)

# 1. K-means clustering

In [5]:
# Initialize empty list to store the Silhouette scores for each k value and time required
kmeans_scores = []
kmeans_times = []

# Evaluate the Silhouette score for each value of k for k-means clustering
for k in k_values:
    kmeans = KMeans(n_clusters=6)
    labels = kmeans.fit_predict(X)
    score = silhouette_score(X, labels)
    kmeans_scores.append(score)

# Print the Silhouette scores for k-means clustering
print("K-means scores:", kmeans_scores)

# Initialize the KMeans clustering algorithm
kmeans = KMeans(n_clusters=3)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
kmeans.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
kmeans_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("KMeans clustering time:", kmeans_times)

K-means scores: [0.8773598869791129, 0.8773598869791129, 0.8773598869791129, 0.8775288177840372, 0.8775288177840372, 0.8775288177840372, 0.8773598869791129, 0.8773598869791129, 0.8773598869791129]
KMeans clustering time: [0.09020590782165527]


# 2. Hierarchical clustering

In [6]:
# Initialize empty list to store the Silhouette score for hierarchical clustering and time required
hierarchical_scores = []
hierarchical_times = []

# Evaluate the Silhouette score for agglomerative hierarchical clustering
agg_clustering = AgglomerativeClustering(n_clusters=6)
labels = agg_clustering.fit_predict(X)
score = silhouette_score(X, labels)
hierarchical_scores.append(score)

# Print the Silhouette score for agglomerative hierarchical clustering
print("Hierarchical clustering score:", hierarchical_scores)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
agg_clustering.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
hierarchical_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("Hierarchical clustering time:", hierarchical_times)

Hierarchical clustering score: [0.868702293346086]
Hierarchical clustering time: [0.05208754539489746]


# 3. DBSCAN clustering

In [7]:
# Initialize empty list to store the Silhouette score for DBSCAN clustering and time required
dbscan_scores = []
dbscan_times = []

# Evaluate the Silhouette score for DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=6)
labels = dbscan.fit_predict(X)
score = silhouette_score(X, labels)
dbscan_scores.append(score)

# Print the Silhouette score for DBSCAN clustering
print("DBSCAN score:", dbscan_scores)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
dbscan.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
dbscan_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("DBSCAN clustering time:", dbscan_times)

DBSCAN score: [0.5923958858182599]
DBSCAN clustering time: [0.02053093910217285]


# 4. Mean-shift clustering

In [8]:
# Initialize empty list to store the Silhouette score for mean-shift clustering and time required
mean_shift_scores = []
mean_shift_times = []

# Evaluate the Silhouette score for mean-shift clustering
mean_shift = MeanShift()
labels = mean_shift.fit_predict(X)
score = silhouette_score(X, labels)
mean_shift_scores.append(score)

# Print the Silhouette score for mean-shift clustering
print("Mean-shift clustering score:", mean_shift_scores)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
mean_shift.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
mean_shift_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("mean_shift clustering time:", mean_shift_times)

Mean-shift clustering score: [0.8320908765649782]
mean_shift clustering time: [2.388662099838257]


# 5. OPTICS (Ordering Points To Identify the Clustering Structure)

In [9]:
# Initialize empty list to store the Silhouette score for OPTICS clustering and time required
optics_scores = []
optics_times = []

# Evaluate the Silhouette score for OPTICS clustering
optics = OPTICS(min_samples=6)
labels = optics.fit_predict(X)
score = silhouette_score(X, labels)
optics_scores.append(score)

# Print the Silhouette score for OPTICS clustering
print("OPTICS clustering score:", optics_scores)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
optics.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
optics_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("optics clustering time:", optics_times)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


OPTICS clustering score: [0.6854441049605815]
optics clustering time: [1.3300151824951172]


  ratio = reachability_plot[:-1] / reachability_plot[1:]


# 6. HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise)

In [10]:
# Initialize empty list to store the Silhouette score for HDBSCAN clustering and time required
hdbscan_scores = []
hdbscan_times = []

# Evaluate the Silhouette score for HDBSCAN clustering
hdbscan = hdbscan.HDBSCAN(min_cluster_size=6)
labels = hdbscan.fit_predict(X)
score = silhouette_score(X, labels)
hdbscan_scores.append(score)

# Print the Silhouette score for HDBSCAN clustering
print("HDBSCAN clustering score:", hdbscan_scores)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
hdbscan.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
hdbscan_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("HDBSCAN clustering time:", hdbscan_times)

HDBSCAN clustering score: [0.7182468668452815]
HDBSCAN clustering time: [0.05603814125061035]


# 7. Gaussian Mixture Model (GMM)

In [11]:
# Initialize the Gaussian Mixture Model algorithm and list to store time required to cluster 
gmm = GaussianMixture(n_components=6)
gmm_times = []

# Fit the Gaussian Mixture Model algorithm on your data
gmm.fit(X)

# Assign the cluster labels to your data points
labels = gmm.predict(X)

# Calculate the Silhouette score for the GMM clustering
silhouette_score = silhouette_score(X, labels)

# Print the Silhouette score for the GMM clustering
print("GMM Silhouette score:", silhouette_score)

# Measure the time required to fit the KMeans clustering algorithm
start_time = time.time()
gmm.fit(X)
end_time = time.time()
elapsed_time = end_time - start_time
gmm_times.append(elapsed_time)

# Print the time required for the KMeans clustering algorithm
print("GMM clustering time:", gmm_times)

GMM Silhouette score: 0.7314242029976433
GMM clustering time: [0.06091046333312988]
