COMP-6651 - Algorithm Design Techniques

Project - Clustering Algorithms Analysis

DBSCAN Experimentation

Author - Nitheesh Kumar Kambala - 40299620


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score, mutual_info_score, accuracy_score
from scipy.spatial.distance import cdist, pdist
import os
import kagglehub

path_iris = kagglehub.dataset_download("himanshunakrani/iris-dataset")
f_path_iris = os.path.join(path_iris, 'iris.csv')
iris_df = pd.read_csv(f_path_iris)
iris_data_target = iris_df['species']
iris_data_features = iris_df.drop(columns=['species']).values

# Preprocessing for AI Global Index Dataset
def preprocess_ai_index(data):
    # Separate numerical and categorical columns
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(exclude=[np.number]).columns

    scaler = MinMaxScaler()
    num_scaled = scaler.fit_transform(data[numeric_cols]) if len(numeric_cols) > 0 else np.array([])

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cat_encoded = encoder.fit_transform(data[categorical_cols]) if len(categorical_cols) > 0 else np.array([])

    if num_scaled.size and cat_encoded.size:
        return np.hstack((num_scaled, cat_encoded))
    elif num_scaled.size:
        return num_scaled
    else:
        return cat_encoded

path_ai_index = kagglehub.dataset_download("katerynameleshenko/ai-index")
f_path_ai_index = os.path.join(path_ai_index, 'AI_index_db.csv')
ai_df = pd.read_csv(f_path_ai_index).dropna()
ai_data_target = ai_df['Cluster']
ai_data_features = ai_df.drop(columns=['Cluster'])
ai_data_features = preprocess_ai_index(ai_data_features)

# Load Global Earthquake Dataset
path_earthquakes= kagglehub.dataset_download("shreyasur965/recent-earthquakes")
f_path_earthquakes = os.path.join(path_earthquakes, 'earthquakes.csv')
earthquake_df = pd.read_csv(f_path_earthquakes)
earthquake_df = earthquake_df[['magnitude', 'felt', 'cdi','mmi','tsunami','sig','depth', 'latitude', 'longitude', 'alert']].dropna()
earthquake_data_target = earthquake_df['alert']
earthquake_data_features = earthquake_df.drop(columns=['alert'])
earthquake_data_features = StandardScaler().fit_transform(earthquake_data_features)

Downloading from https://www.kaggle.com/api/v1/datasets/download/himanshunakrani/iris-dataset?dataset_version_number=1...


100%|██████████| 0.98k/0.98k [00:00<00:00, 356kB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/katerynameleshenko/ai-index?dataset_version_number=1...


100%|██████████| 2.38k/2.38k [00:00<00:00, 2.81MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/shreyasur965/recent-earthquakes?dataset_version_number=3...


100%|██████████| 214k/214k [00:00<00:00, 23.8MB/s]

Extracting files...





In [None]:
class BASE_DBSCAN:
    def __init__(self, eps, min_pts):
        self.eps = eps
        self.min_pts = min_pts

    def fit(self, data):
        self.data = data
        n = len(data)
        self.labels = np.full(n, -1)
        cluster_id = 0
        for i in range(n):
            if self.labels[i] == -1:
                neighbors = self.region_query(i)
                if len(neighbors) >= self.min_pts:
                    cluster_id += 1
                    self.expand_cluster(i, neighbors, cluster_id)
        return self.labels

    def region_query(self, point_idx):
        dists = cdist([self.data[point_idx]], self.data)[0]
        return np.where(dists <= self.eps)[0]

    def expand_cluster(self, point_idx, neighbors, cluster_id):
        self.labels[point_idx] = cluster_id
        i = 0
        while i < len(neighbors):
            neighbor_idx = neighbors[i]
            if self.labels[neighbor_idx] == -1:
                self.labels[neighbor_idx] = cluster_id
            elif self.labels[neighbor_idx] == 0:
                self.labels[neighbor_idx] = cluster_id
                new_neighbors = self.region_query(neighbor_idx)
                if len(new_neighbors) >= self.min_pts:
                    neighbors = np.append(neighbors, new_neighbors)
            i += 1

In [None]:
def sklearn_dbscan(data, eps, min_pts):
    model = DBSCAN(eps=eps, min_samples=min_pts)
    return model.fit_predict(data)

In [None]:
from collections import Counter

def map_clusters_to_labels(cluster_labels, actual_labels):
    cluster_mapping = {}
    predicted_alerts = np.full_like(cluster_labels, fill_value="None", dtype=object)
    unique_clusters = np.unique(cluster_labels)
    for cluster in unique_clusters:
        if cluster == -1:
            continue
        cluster_indices = np.where(cluster_labels == cluster)[0]
        cluster_alerts = actual_labels[cluster_indices]
        most_common_alert = Counter(cluster_alerts).most_common(1)[0][0]
        cluster_mapping[cluster] = most_common_alert
        predicted_alerts[cluster_indices] = most_common_alert
    return cluster_mapping, predicted_alerts

def compute_clusters_mean_diameter(X, labels):
    diameters = []
    for cluster in np.unique(labels):
        if cluster == -1:
            continue
        points = X[labels == cluster]
        if len(points) < 2:
            diameters.append(0)
            continue
        dists = pdist(points)
        diameters.append(np.max(dists))
    return np.mean(diameters)

def compute_clusters_mean_splits(X, labels):
    splits = []
    for cluster in np.unique(labels):
        if cluster == -1:
            continue
        in_cluster = X[labels == cluster]
        out_cluster = X[labels != cluster]
        if len(out_cluster) == 0:
            splits.append(0)
            continue
        dists = cdist(in_cluster, out_cluster)
        splits.append(np.min(dists))
    return np.mean(splits)

def compute_metrics(data, labels, target, predicted):
    if len(set(labels)) > 1:
        silhouette = silhouette_score(data, labels)
        davies_bouldin = davies_bouldin_score(data, labels)
        calinski_harabasz = calinski_harabasz_score(data, labels)
        ari = adjusted_rand_score(target, predicted)
        mi = mutual_info_score(target, predicted)
        accuracy = accuracy_score(target, predicted)
        mean_diameter = compute_clusters_mean_diameter(data, predicted)
        mean_splits = compute_clusters_mean_splits(data, labels)
    else:
        silhouette, davies_bouldin, calinski_harabasz,ari, mi, accuracy, mean_diameter, mean_split = None, None, None, None, None, None, None, None

    return {
        'Silhouette Score': silhouette,
        'Davies-Bouldin Index': davies_bouldin,
        'Calinski-Harabasz Index': calinski_harabasz,
        'ARI': ari,
        'MI': mi,
        'Mean Diameter': mean_diameter,
        'Mean Splits': mean_splits,
        'Accuracy': accuracy
    }


In [None]:
def dbscan_experimentation(dataset_name, features, target, min_pts, eps):
    print(f"\nProcessing {dataset_name} Dataset")

    # BASE DBSCAN
    base_labels = BASE_DBSCAN(eps, min_pts).fit(features)
    #print(base_labels)
    base_cluster_mapping, base_predicted = map_clusters_to_labels(base_labels, target.values)
    base_metrics = compute_metrics(features, base_labels, target, base_predicted)
    print("BASE DBSCAN Metrics:")
    for k, v in base_metrics.items():
        print(f"  {k}: {v:.4f}")

    # Scikit-learn DBSCAN
    sklearn_labels = sklearn_dbscan(features, eps, min_pts)
    sklearn_cluster_mapping, sklearn_predicted = map_clusters_to_labels(sklearn_labels, target.values)
    #print(sklearn_labels)
    sklearn_metrics = compute_metrics(features, sklearn_labels, target, sklearn_predicted)
    print("Scikit-learn DBSCAN Metrics:")
    for k, v in sklearn_metrics.items():
        print(f"  {k}: {v:.4f}")
    return base_labels, sklearn_labels

In [None]:
iris_base_labels, iris_sklearn_labels = dbscan_experimentation('Iris', iris_data_features, iris_data_target, 5, 1)


Processing Iris Dataset
BASE DBSCAN Metrics:
  Silhouette Score: 0.2022
  Davies-Bouldin Index: 0.8335
  Calinski-Harabasz Index: 220.6209
  ARI: 0.7860
  MI: 0.8547
  Mean Diameter: 2.8768
  Mean Splits: 0.3391
  Accuracy: 0.9200
Scikit-learn DBSCAN Metrics:
  Silhouette Score: 0.6864
  Davies-Bouldin Index: 0.3836
  Calinski-Harabasz Index: 501.9249
  ARI: 0.5681
  MI: 0.6365
  Mean Diameter: 3.6342
  Mean Splits: 1.6401
  Accuracy: 0.6667


In [None]:
ai_global_base_labels, ai_global_sklearn_labels = dbscan_experimentation('AI Global Index', ai_data_features, ai_data_target, 5, 2)


Processing AI Global Index Dataset
BASE DBSCAN Metrics:
  Silhouette Score: 0.1011
  Davies-Bouldin Index: 1.8004
  Calinski-Harabasz Index: 7.1761
  ARI: 0.0420
  MI: 0.1517
  Mean Diameter: 2.5514
  Mean Splits: 2.0063
  Accuracy: 0.3226
Scikit-learn DBSCAN Metrics:
  Silhouette Score: 0.1011
  Davies-Bouldin Index: 1.8004
  Calinski-Harabasz Index: 7.1761
  ARI: 0.0420
  MI: 0.1517
  Mean Diameter: 2.5514
  Mean Splits: 2.0063
  Accuracy: 0.3226


In [None]:
earthquake_base_labels, earthquake_sklearn_labels = dbscan_experimentation('Earthquake', earthquake_data_features, earthquake_data_target, 5, 1)


Processing Earthquake Dataset
BASE DBSCAN Metrics:
  Silhouette Score: 0.0953
  Davies-Bouldin Index: 1.6416
  Calinski-Harabasz Index: 13.0675
  ARI: 0.3460
  MI: 0.1290
  Mean Diameter: 9.6661
  Mean Splits: 0.6016
  Accuracy: 0.8233
Scikit-learn DBSCAN Metrics:
  Silhouette Score: 0.1251
  Davies-Bouldin Index: 1.6527
  Calinski-Harabasz Index: 39.3070
  ARI: 0.2702
  MI: 0.0854
  Mean Diameter: 12.0946
  Mean Splits: 0.9709
  Accuracy: 0.8233


### 4.1 Predicting alert-level based on clusters and analyzing how good the prediction is.

In [None]:
earthquake_base_labels, earthquake_sklearn_labels = dbscan_experimentation('Earthquake', earthquake_data_features, earthquake_data_target, 5, 1)


Processing Earthquake Dataset
BASE DBSCAN Metrics:
  Silhouette Score: 0.0953
  Davies-Bouldin Index: 1.6416
  Calinski-Harabasz Index: 13.0675
  ARI: 0.3460
  MI: 0.1290
  Mean Diameter: 9.6661
  Mean Splits: 0.6016
  Accuracy: 0.8233
Scikit-learn DBSCAN Metrics:
  Silhouette Score: 0.1251
  Davies-Bouldin Index: 1.6527
  Calinski-Harabasz Index: 39.3070
  ARI: 0.2702
  MI: 0.0854
  Mean Diameter: 12.0946
  Mean Splits: 0.9709
  Accuracy: 0.8233


### 4.2 Reducing the number of attributes to 5 using Mutual Information Scores between feature and alert level.

In [None]:
from sklearn.feature_selection import mutual_info_classif

X = earthquake_data_features
y = earthquake_data_target

# Compute MI scores
mi_scores = mutual_info_classif(X, y)

# Select top 5 features
top_5_features = np.array(np.arange(X.shape[1]))[mi_scores.argsort()[-5:]]

print("Top 5 features most predictive of 'alert':", earthquake_df.columns[top_5_features])

X_selected = X[:, top_5_features]

earthquake_data_top_5 = X_selected

Top 5 features most predictive of 'alert': Index(['magnitude', 'latitude', 'longitude', 'mmi', 'sig'], dtype='object')


### 4.3 Running clustering experimentation for both base and library implementation for top-5 attribute of earthquake data

In [None]:
base_labels, sklearn_labels = dbscan_experimentation('Earthquake', earthquake_data_top_5, earthquake_data_target, 3, 1)


Processing Earthquake Dataset
BASE DBSCAN Metrics:
  Silhouette Score: 0.2500
  Davies-Bouldin Index: 1.2174
  Calinski-Harabasz Index: 108.2232
  ARI: 0.6649
  MI: 0.1728
  Mean Diameter: 3.8201
  Mean Splits: 0.6227
  Accuracy: 0.9490
Scikit-learn DBSCAN Metrics:
  Silhouette Score: 0.1018
  Davies-Bouldin Index: 1.1700
  Calinski-Harabasz Index: 60.3764
  ARI: 0.5611
  MI: 0.1397
  Mean Diameter: 3.7742
  Mean Splits: 1.2190
  Accuracy: 0.9411
