COMP-6651 - Algorithm Design Techniques

Project - Clustering Algorithms Analysis

OPTICS Experimentation

Author - Nitheesh Kumar Kambala - 40299620


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import OPTICS
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score, mutual_info_score, accuracy_score
from scipy.spatial.distance import cdist, pdist
from heapq import heappush, heappop
import os
import kagglehub

path_iris = kagglehub.dataset_download("himanshunakrani/iris-dataset")
f_path_iris = os.path.join(path_iris, 'iris.csv')
iris_df = pd.read_csv(f_path_iris)
iris_data_target = iris_df['species']
iris_data_features = iris_df.drop(columns=['species']).values

# Preprocessing for AI Global Index Dataset
def preprocess_ai_index(data):
    # Separate numerical and categorical columns
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(exclude=[np.number]).columns

    scaler = MinMaxScaler()
    num_scaled = scaler.fit_transform(data[numeric_cols]) if len(numeric_cols) > 0 else np.array([])

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cat_encoded = encoder.fit_transform(data[categorical_cols]) if len(categorical_cols) > 0 else np.array([])

    if num_scaled.size and cat_encoded.size:
        return np.hstack((num_scaled, cat_encoded))
    elif num_scaled.size:
        return num_scaled
    else:
        return cat_encoded

path_ai_index = kagglehub.dataset_download("katerynameleshenko/ai-index")
f_path_ai_index = os.path.join(path_ai_index, 'AI_index_db.csv')
ai_df = pd.read_csv(f_path_ai_index).dropna()
ai_data_target = ai_df['Cluster']
ai_data_features = ai_df.drop(columns=['Cluster'])
ai_data_features = preprocess_ai_index(ai_data_features)

# Load Global Earthquake Dataset
path_earthquakes= kagglehub.dataset_download("shreyasur965/recent-earthquakes")
f_path_earthquakes = os.path.join(path_earthquakes, 'earthquakes.csv')
earthquake_df = pd.read_csv(f_path_earthquakes)
earthquake_df = earthquake_df[['magnitude', 'felt', 'cdi','mmi','tsunami','sig','depth', 'latitude', 'longitude', 'alert']].dropna()
earthquake_data_target = earthquake_df['alert']
earthquake_data_features = earthquake_df.drop(columns=['alert'])
earthquake_data_features = StandardScaler().fit_transform(earthquake_data_features)

Downloading from https://www.kaggle.com/api/v1/datasets/download/himanshunakrani/iris-dataset?dataset_version_number=1...


100%|██████████| 0.98k/0.98k [00:00<00:00, 1.15MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/katerynameleshenko/ai-index?dataset_version_number=1...


100%|██████████| 2.38k/2.38k [00:00<00:00, 1.00MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/shreyasur965/recent-earthquakes?dataset_version_number=3...


100%|██████████| 214k/214k [00:00<00:00, 35.7MB/s]

Extracting files...





In [2]:
class BASE_OPTICS:
    def __init__(self, min_pts):
        self.min_pts = min_pts

    def fit(self, data):
        self.data = data
        n = len(data)
        self.reachability = np.full(n, np.inf)
        self.ordered_list = []
        self.processed = np.zeros(n, dtype=bool)

        for i in range(n):
            if not self.processed[i]:
                neighbors = self.region_query(i)
                self.processed[i] = True
                self.ordered_list.append(i)

                if len(neighbors) >= self.min_pts:
                    self.expand_cluster_order(i, neighbors)

        return self.ordered_list, self.reachability

    def region_query(self, point_idx):
        dists = cdist([self.data[point_idx]], self.data)[0]
        return np.where(dists <= self.eps(point_idx))[0]

    def eps(self, point_idx):
        dists = cdist([self.data[point_idx]], self.data)[0]
        sorted_dists = np.sort(dists)
        return sorted_dists[self.min_pts] if len(sorted_dists) > self.min_pts else np.inf

    def expand_cluster_order(self, point_idx, neighbors):
        priority_queue = []

        for neighbor in neighbors:
            if not self.processed[neighbor]:
                self.reachability[neighbor] = max(self.eps(point_idx), cdist([self.data[point_idx]], [self.data[neighbor]])[0][0])
                heappush(priority_queue, (self.reachability[neighbor], neighbor))

        while priority_queue:
            _, current = heappop(priority_queue)
            if not self.processed[current]:
                self.processed[current] = True
                self.ordered_list.append(current)
                new_neighbors = self.region_query(current)

                if len(new_neighbors) >= self.min_pts:
                    for neighbor in new_neighbors:
                        if not self.processed[neighbor]:
                            new_reach = max(self.eps(current), cdist([self.data[current]], [self.data[neighbor]])[0][0])
                            if new_reach < self.reachability[neighbor]:
                                self.reachability[neighbor] = new_reach
                                heappush(priority_queue, (new_reach, neighbor))

In [3]:
def sklearn_optics(data, min_pts):
    sklearn_optics = OPTICS(min_samples=min_pts, metric='euclidean')
    sklearn_optics.fit(data)
    return sklearn_optics.labels_, sklearn_optics.reachability_

In [4]:
from collections import Counter

def map_clusters_to_labels(cluster_labels, actual_labels):
    cluster_mapping = {}
    predicted_alerts = np.full_like(cluster_labels, fill_value="None", dtype=object)
    unique_clusters = np.unique(cluster_labels)
    for cluster in unique_clusters:
        if cluster == -1:
            continue
        cluster_indices = np.where(cluster_labels == cluster)[0]
        cluster_alerts = actual_labels[cluster_indices]
        most_common_alert = Counter(cluster_alerts).most_common(1)[0][0]
        cluster_mapping[cluster] = most_common_alert
        predicted_alerts[cluster_indices] = most_common_alert
    return cluster_mapping, predicted_alerts

def compute_clusters_mean_diameter(X, labels):
    diameters = []
    for cluster in np.unique(labels):
        if cluster == -1:
            continue
        points = X[labels == cluster]
        if len(points) < 2:
            diameters.append(0)
            continue
        dists = pdist(points)
        diameters.append(np.max(dists))
    return np.mean(diameters)

def compute_clusters_mean_splits(X, labels):
    splits = []
    for cluster in np.unique(labels):
        if cluster == -1:
            continue
        in_cluster = X[labels == cluster]
        out_cluster = X[labels != cluster]
        if len(out_cluster) == 0:
            splits.append(0)
            continue
        dists = cdist(in_cluster, out_cluster)
        splits.append(np.min(dists))
    return np.mean(splits)

def get_labels(data, ordered_list, reachability, eps_threshold):
    labels = np.full(len(data), -1)

    for i in range(len(data)):
        if reachability[i] <= eps_threshold:
            labels[i] = ordered_list[i]

    return labels

def compute_metrics(data, labels, target, predicted):
    if len(set(labels)) > 1:
        silhouette = silhouette_score(data, labels)
        davies_bouldin = davies_bouldin_score(data, labels)
        calinski_harabasz = calinski_harabasz_score(data, labels)
        ari = adjusted_rand_score(target, predicted)
        mi = mutual_info_score(target, predicted)
        accuracy = accuracy_score(target, predicted)
        mean_diameter = compute_clusters_mean_diameter(data, predicted)
        mean_splits = compute_clusters_mean_splits(data, labels)
    else:
        silhouette, davies_bouldin, calinski_harabasz,ari, mi, accuracy, mean_diameter, mean_split = None, None, None, None, None, None, None, None

    return {
        'Silhouette Score': silhouette,
        'Davies-Bouldin Index': davies_bouldin,
        'Calinski-Harabasz Index': calinski_harabasz,
        'ARI': ari,
        'MI': mi,
        'Mean Diameter': mean_diameter,
        'Mean Splits': mean_splits,
        'Accuracy': accuracy
    }

In [5]:
def optics_experimentation(dataset_name, features, target, min_pts, eps_threshold):
    print(f"\nProcessing {dataset_name} Dataset")

    # BASE OPTICS
    base_ordered_list, base_reachability = BASE_OPTICS(min_pts).fit(features)
    base_labels = get_labels(features, base_ordered_list, base_reachability, eps_threshold)
    base_cluster_mapping, base_predicted = map_clusters_to_labels(base_labels, target.values)
    base_metrics = compute_metrics(features, base_labels, target, base_predicted)
    print("BASE OPTICS Metrics:")
    for k, v in base_metrics.items():
        print(f"  {k}: {v:.4f}")

    # Scikit-learn OPTICS
    sklearn_ordered_list, sklearn_reachability = sklearn_optics(features, min_pts)
    sklearn_labels = get_labels(features, sklearn_ordered_list, sklearn_reachability, eps_threshold)
    sklearn_cluster_mapping, sklearn_predicted = map_clusters_to_labels(sklearn_labels, target.values)
    sklearn_metrics = compute_metrics(features, sklearn_labels, target, sklearn_predicted)
    print("Scikit-learn OPTICS Metrics:")
    for k, v in sklearn_metrics.items():
        print(f"  {k}: {v:.4f}")
    return base_labels, sklearn_labels

In [6]:
iris_base_labels, iris_sklearn_labels = optics_experimentation('Iris', iris_data_features, iris_data_target, 5, 1)


Processing Iris Dataset
BASE OPTICS Metrics:
  Silhouette Score: -0.0790
  Davies-Bouldin Index: 2.1060
  Calinski-Harabasz Index: 0.6453
  ARI: 0.8550
  MI: 1.0020
  Mean Diameter: 3.6180
  Mean Splits: 0.2262
  Accuracy: 0.9067
Scikit-learn OPTICS Metrics:
  Silhouette Score: -0.2268
  Davies-Bouldin Index: 2.7337
  Calinski-Harabasz Index: 15.4383
  ARI: 0.1526
  MI: 0.4013
  Mean Diameter: 2.8074
  Mean Splits: 0.2323
  Accuracy: 0.3867


In [7]:
ai_global_base_labels, ai_global_sklearn_labels = optics_experimentation('AI Global Index', ai_data_features, ai_data_target, 5, 2)


Processing AI Global Index Dataset
BASE OPTICS Metrics:
  Silhouette Score: -0.0705
  Davies-Bouldin Index: 1.0476
  Calinski-Harabasz Index: 0.7102
  ARI: 0.1166
  MI: 0.3779
  Mean Diameter: 2.2589
  Mean Splits: 1.4306
  Accuracy: 0.3710
Scikit-learn OPTICS Metrics:
  Silhouette Score: 0.0671
  Davies-Bouldin Index: 1.8726
  Calinski-Harabasz Index: 5.9671
  ARI: 0.0538
  MI: 0.1512
  Mean Diameter: 2.5514
  Mean Splits: 1.4298
  Accuracy: 0.3065


In [8]:
earthquake_base_labels, earthquake_sklearn_labels = optics_experimentation('Earthquake', earthquake_data_features, earthquake_data_target, 5, 1)


Processing Earthquake Dataset
BASE OPTICS Metrics:
  Silhouette Score: -0.2135
  Davies-Bouldin Index: 1.5069
  Calinski-Harabasz Index: 0.3043
  ARI: 0.2032
  MI: 0.1543
  Mean Diameter: 9.4189
  Mean Splits: 0.0666
  Accuracy: 0.6950
Scikit-learn OPTICS Metrics:
  Silhouette Score: -0.1555
  Davies-Bouldin Index: 1.4743
  Calinski-Harabasz Index: 6.2817
  ARI: 0.0532
  MI: 0.1103
  Mean Diameter: 9.9306
  Mean Splits: 0.3502
  Accuracy: 0.4110


### 4.1 Predicting alert-level based on clusters and analyzing how good the prediction is.

In [9]:
earthquake_base_labels, earthquake_sklearn_labels = optics_experimentation('Earthquake', earthquake_data_features, earthquake_data_target, 5, 1)


Processing Earthquake Dataset
BASE OPTICS Metrics:
  Silhouette Score: -0.2135
  Davies-Bouldin Index: 1.5069
  Calinski-Harabasz Index: 0.3043
  ARI: 0.2032
  MI: 0.1543
  Mean Diameter: 9.4189
  Mean Splits: 0.0666
  Accuracy: 0.6950
Scikit-learn OPTICS Metrics:
  Silhouette Score: -0.1555
  Davies-Bouldin Index: 1.4743
  Calinski-Harabasz Index: 6.2817
  ARI: 0.0532
  MI: 0.1103
  Mean Diameter: 9.9306
  Mean Splits: 0.3502
  Accuracy: 0.4110


### 4.2 Reducing the number of attributes to 5 using Mutual Information Scores between feature and alert level.

In [10]:
from sklearn.feature_selection import mutual_info_classif

X = earthquake_data_features
y = earthquake_data_target

# Compute MI scores
mi_scores = mutual_info_classif(X, y)

# Select top 5 features
top_5_features = np.array(np.arange(X.shape[1]))[mi_scores.argsort()[-5:]]

print("Top 5 features most predictive of 'alert':", earthquake_df.columns[top_5_features])

X_selected = X[:, top_5_features]

earthquake_data_top_5 = X_selected

Top 5 features most predictive of 'alert': Index(['magnitude', 'latitude', 'longitude', 'mmi', 'sig'], dtype='object')


### 4.3 Running clustering experimentation for both base and library implementation for top-5 attribute of earthquake data

In [11]:
base_labels, sklearn_labels = optics_experimentation('Earthquake', earthquake_data_top_5, earthquake_data_target, 5, 1)


Processing Earthquake Dataset
BASE OPTICS Metrics:
  Silhouette Score: -0.1604
  Davies-Bouldin Index: 1.4623
  Calinski-Harabasz Index: 0.5502
  ARI: 0.3622
  MI: 0.2020
  Mean Diameter: 4.0045
  Mean Splits: 0.0416
  Accuracy: 0.8102
Scikit-learn OPTICS Metrics:
  Silhouette Score: -0.0097
  Davies-Bouldin Index: 1.3400
  Calinski-Harabasz Index: 12.7160
  ARI: 0.1201
  MI: 0.1706
  Mean Diameter: 3.7755
  Mean Splits: 0.2677
  Accuracy: 0.5105
