In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score, mutual_info_score, accuracy_score, normalized_mutual_info_score
from scipy.spatial.distance import pdist, squareform, cdist
import os
import kagglehub

In [29]:
path_iris = kagglehub.dataset_download("himanshunakrani/iris-dataset")
f_path_iris = os.path.join(path_iris, 'iris.csv')
iris_df = pd.read_csv(f_path_iris)
# Extract labels (species)
iris_labels = iris_df['species']
# Remove label from the features
iris_features = iris_df.drop(columns=['species'])
# Convert to numpy
iris_data = iris_features.values


# Preprocessing for AI Global Index Dataset
def preprocess_ai_index(data):
    # Separate numerical and categorical columns
    numeric_cols = data.select_dtypes(include=[np.number]).columns
    categorical_cols = data.select_dtypes(exclude=[np.number]).columns

    scaler = MinMaxScaler()
    num_scaled = scaler.fit_transform(data[numeric_cols]) if len(numeric_cols) > 0 else np.array([])

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cat_encoded = encoder.fit_transform(data[categorical_cols]) if len(categorical_cols) > 0 else np.array([])

    if num_scaled.size and cat_encoded.size:
        return np.hstack((num_scaled, cat_encoded))
    elif num_scaled.size:
        return num_scaled
    else:
        return cat_encoded

path_ai_index = kagglehub.dataset_download("katerynameleshenko/ai-index")
f_path_ai_index = os.path.join(path_ai_index, 'AI_index_db.csv')
ai_df = pd.read_csv(f_path_ai_index)
ai_df = ai_df.dropna()
# Extract the 'Cluster' column as the label
ai_labels = ai_df['Cluster']
# Remove the 'Cluster' column from the features
ai_df_features = ai_df.drop(columns=['Cluster'])
# Preprocess the remaining features
ai_data = preprocess_ai_index(ai_df_features)

path_earthquakes= kagglehub.dataset_download("shreyasur965/recent-earthquakes")
f_path_earthquakes = os.path.join(path_earthquakes, 'earthquakes.csv')
earthquake_df = pd.read_csv(f_path_earthquakes)
earthquake_df = earthquake_df[['magnitude', 'felt', 'cdi','mmi','tsunami','sig','depth', 'latitude', 'longitude', 'alert']].dropna()
# Extract the alert labels
earthquake_data_alerts = earthquake_df['alert']
alert_encoded = LabelEncoder().fit_transform(earthquake_data_alerts)
# Remove the label from features
earthquake_data_features = earthquake_df.drop(columns=['alert'])
# Scale the numeric features
earthquake_data = StandardScaler().fit_transform(earthquake_data_features)

In [30]:
class BASE_KMEANS:
    def __init__(self, k, max_iters=100, tolerance=1e-4):
        self.k = k
        self.max_iters = max_iters
        self.tolerance = tolerance
        self.centroids = None
        self.labels = None

    def fit(self, data):
        self.data = data
        n_samples, n_features = data.shape
        random_indices = np.random.choice(n_samples, self.k, replace=False)
        self.centroids = data[random_indices]
        prev_centroids = np.zeros_like(self.centroids)
        self.labels = np.zeros(n_samples, dtype=int)  # Ensure labels are integers

        for _ in range(self.max_iters):
            distances = np.linalg.norm(data[:, np.newaxis] - self.centroids, axis=2)
            self.labels = np.argmin(distances, axis=1)
            for i in range(self.k):
                cluster_points = data[self.labels == i]
                if len(cluster_points) > 0:
                    self.centroids[i] = np.mean(cluster_points, axis=0)
            if np.linalg.norm(self.centroids - prev_centroids) < self.tolerance:
                break
            prev_centroids = self.centroids.copy()
        return self.labels, self.centroids

In [31]:
def sklearn_kmeans(features, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(features)
    return labels

In [32]:
def compute_diameter(X, labels):
    """
    Computes the average 'diameter' across all clusters,
    where 'diameter' of a cluster is the maximum distance
    between any two points in that cluster.
    """
    unique_labels = np.unique(labels)
    diameters = []
    for lbl in unique_labels:
        cluster_points = X[labels == lbl]
        if len(cluster_points) > 1:
            # pairwise distances in the cluster
            dist_matrix = squareform(pdist(cluster_points))
            diameters.append(dist_matrix.max())
        else:
            # A single point has diameter 0
            diameters.append(0.0)
    return np.mean(diameters)


def compute_split(X, labels):
    """
    An example 'split' metric: ratio of the size of the largest cluster
    to the size of the smallest cluster. If there's only one cluster, return 1.
    """
    unique_labels, counts = np.unique(labels, return_counts=True)
    if len(counts) < 2:
        return 1.0
    return counts.max() / counts.min()


def evaluate_clustering(X, labels, true_labels=None):
    """
    Compute a set of metrics for the given clustering labels.
    Some metrics require ground truth (true_labels).
    If no true_labels is provided, ARI and MI will be omitted.
    Returns a dict of metric_name -> value.
    """
    metrics_dict = {}

    # Unsupervised metrics
    metrics_dict["Silhouette"] = silhouette_score(X, labels)
    metrics_dict["Davies-Bouldin"] = davies_bouldin_score(X, labels)
    metrics_dict["Calinski-Harabasz"] = calinski_harabasz_score(X, labels)
    metrics_dict["Diameter"] = compute_diameter(X, labels)
    metrics_dict["Split"] = compute_split(X, labels)

    # If we have ground-truth labels, we can compute supervised metrics
    if true_labels is not None:
        metrics_dict["Adjusted Rand Index"] = adjusted_rand_score(true_labels, labels)
        # Using normalized_mutual_info_score as a measure of MI
        metrics_dict["Mutual Information"] = normalized_mutual_info_score(true_labels, labels)
    return metrics_dict

In [33]:
def kmeans_experimentation(dataset_name, features, target, k):
    print(f"\nProcessing {dataset_name} Dataset")

    # Get labels from your BASE_KMEANS
    base_labels, _ = BASE_KMEANS(k).fit(features)

    # Evaluate your BASE_KMEANS using the evaluate_clustering function
    # base_metrics = evaluate_clustering(features, base_labels, true_labels=target.values)
    base_metrics = evaluate_clustering(features, base_labels, true_labels=target)
    print("BASE K-Means Metrics:")
    for t, v in base_metrics.items():
        print(f"  {t}: {v:.4f}")

    # Get labels from scikit-learn's KMeans
    sklearn_labels = sklearn_kmeans(features, k)

    # Evaluate scikit-learn's KMeans using the evaluate_clustering function
    # sklearn_metrics = evaluate_clustering(features, sklearn_labels, true_labels=target.values)
    sklearn_metrics = evaluate_clustering(features, sklearn_labels, true_labels=target)
    print("\nScikit-learn K-Means Metrics:")
    for t, v in sklearn_metrics.items():
        print(f"  {t}: {v:.4f}")

    return base_labels, sklearn_labels

In [40]:
base_labels_iris, sklearn_labels_iris = kmeans_experimentation("Iris", iris_data, iris_labels, 3)


Processing Iris Dataset
BASE K-Means Metrics:
  Silhouette: 0.5510
  Davies-Bouldin: 0.6664
  Calinski-Harabasz: 560.3660
  Diameter: 2.5210
  Split: 1.5641
  Adjusted Rand Index: 0.7163
  Mutual Information: 0.7419

Scikit-learn K-Means Metrics:
  Silhouette: 0.5526
  Davies-Bouldin: 0.6623
  Calinski-Harabasz: 560.3999
  Diameter: 2.5085
  Split: 1.6316
  Adjusted Rand Index: 0.7302
  Mutual Information: 0.7582


In [35]:
ai_global_base_labels, ai_global_sklearn_labels = kmeans_experimentation('AI Global Index', ai_data, ai_labels, 5)


Processing AI Global Index Dataset
BASE K-Means Metrics:
  Silhouette: 0.0589
  Davies-Bouldin: 2.3240
  Calinski-Harabasz: 7.6086
  Diameter: 2.4678
  Split: 2.4286
  Adjusted Rand Index: 0.0529
  Mutual Information: 0.2908

Scikit-learn K-Means Metrics:
  Silhouette: 0.1848
  Davies-Bouldin: 1.6109
  Calinski-Harabasz: 10.1389
  Diameter: 2.7197
  Split: 5.4000
  Adjusted Rand Index: 0.0016
  Mutual Information: 0.3006


In [36]:
earthquake_base_labels, earthquake_sklearn_labels = kmeans_experimentation('Earthquake', earthquake_data, alert_encoded, 4)


Processing Earthquake Dataset
BASE K-Means Metrics:
  Silhouette: 0.2978
  Davies-Bouldin: 1.2812
  Calinski-Harabasz: 158.7361
  Diameter: 13.1146
  Split: 2.9252
  Adjusted Rand Index: 0.0103
  Mutual Information: 0.0697

Scikit-learn K-Means Metrics:
  Silhouette: 0.3498
  Davies-Bouldin: 1.2806
  Calinski-Harabasz: 162.9608
  Diameter: 12.6598
  Split: 8.1930
  Adjusted Rand Index: 0.1056
  Mutual Information: 0.1820


### 4.1 Predicting alert-level based on clusters and analyzing how good the prediction is.

In [37]:
earthquake_base_labels, earthquake_sklearn_labels = kmeans_experimentation('Earthquake', earthquake_data, alert_encoded, 3)


Processing Earthquake Dataset
BASE K-Means Metrics:
  Silhouette: 0.3372
  Davies-Bouldin: 1.4329
  Calinski-Harabasz: 134.0585
  Diameter: 14.2742
  Split: 9.5385
  Adjusted Rand Index: 0.2854
  Mutual Information: 0.2385

Scikit-learn K-Means Metrics:
  Silhouette: 0.2626
  Davies-Bouldin: 1.5092
  Calinski-Harabasz: 151.4945
  Diameter: 15.1691
  Split: 3.0556
  Adjusted Rand Index: 0.0439
  Mutual Information: 0.1248


### 4.2 Reducing the number of attributes to 5 using Mutual Information Scores between feature and alert level.

In [38]:
from sklearn.feature_selection import mutual_info_classif

X = earthquake_data
y = alert_encoded

# Compute MI scores
mi_scores = mutual_info_classif(X, y)

# Select top 5 features
top_5_features = np.array(np.arange(X.shape[1]))[mi_scores.argsort()[-5:]]

print("Top 5 features most predictive of 'alert':", earthquake_df.columns[top_5_features])

X_selected = X[:, top_5_features]

earthquake_data_top_5 = X_selected

Top 5 features most predictive of 'alert': Index(['magnitude', 'latitude', 'longitude', 'mmi', 'sig'], dtype='object')


### 4.3 Running clustering experimentation for both base and library implementation for top-5 attribute of earthquake data

In [39]:
base_labels, sklearn_labels = kmeans_experimentation('Earthquake', earthquake_data, alert_encoded, 3)


Processing Earthquake Dataset
BASE K-Means Metrics:
  Silhouette: 0.3157
  Davies-Bouldin: 1.4366
  Calinski-Harabasz: 147.1532
  Diameter: 15.0971
  Split: 5.7849
  Adjusted Rand Index: 0.1795
  Mutual Information: 0.1857

Scikit-learn K-Means Metrics:
  Silhouette: 0.2626
  Davies-Bouldin: 1.5092
  Calinski-Harabasz: 151.4945
  Diameter: 15.1691
  Split: 3.0556
  Adjusted Rand Index: 0.0439
  Mutual Information: 0.1248
