Import kmeans_saketh.py, a Python file that contains all the functions needed for K-means, as mentioned in the GitHub repository.

In [1]:
import kmeans_saketh as km
import numpy as np
import scipy

In [2]:
# Input files: the feature rows and their ground-truth labels.
DATA_CSV = '/content/data.csv'
LABEL_CSV = '/content/label.csv'

df = km.loadCSV(DATA_CSV)
labels = km.loadCSV(LABEL_CSV)

In [3]:
# The number of clusters is the number of distinct rows in the label file.
k = len({*labels})
print(k)

10


Run K-means clustering with Euclidean, Cosine and Jaccard similarity. Compare the SSEs of Euclidean-K-means, Cosine-K-means, Jaccard-K-means

In [None]:
def _cluster_with(metric):
    """Run km.kmeans on the notebook's df and k using the given distance name."""
    return km.kmeans(df, k, dist=metric)


# One clustering per distance measure, run in the same order as before.
c_euclidean = _cluster_with('euclidean')
c_cosine = _cluster_with('cosine')
c_jaccard = _cluster_with('jaccard')

In [None]:
# Report the 'withinss' value stored in each clustering result.
_sse_report = (
    ('SSE of Euclidean: ', c_euclidean),
    ('SSE of Cosine: ', c_cosine),
    ('SSE of Jaccard: ', c_jaccard),
)
for _heading, _result in _sse_report:
    print(_heading, _result['withinss'])

Compute the predictive accuracy of Euclidean-K-means, Cosine-K-means, Jaccard-K-means

In [None]:
from tqdm import tqdm
def label_clusters(clusters, labels, df):
    """Give every cluster the most frequent true label among its members.

    Args:
        clusters: Sequence of clusters; each cluster is a sequence of
            instances that also occur as rows of ``df``.
        labels: Sequence of true labels, where ``labels[i]`` belongs to
            ``df[i]``.
        df: Sequence of instances (rows).

    Returns:
        A list with exactly one entry per cluster, in cluster order: the
        majority label of that cluster, or ``None`` when the cluster is
        empty.  Ties go to the label encountered first in the cluster.

    Raises:
        ValueError: If a clustered instance does not occur in ``df``.
    """
    # Map each distinct row to the index of its FIRST occurrence.  This is
    # the same answer a linear ``list.index`` search gives, but it is built
    # once instead of rescanning the whole data set for every instance.
    first_index = {}
    for position, row in enumerate(df):
        first_index.setdefault(tuple(row), position)

    cluster_labels = []
    for cluster in clusters:
        if not cluster:
            # Keep a placeholder so that cluster_labels[i] always describes
            # clusters[i]; dropping the entry would shift every later label.
            cluster_labels.append(None)
            continue
        # Count how often each true label occurs inside this cluster.
        label_counts = {}
        for instance in cluster:
            try:
                position = first_index[tuple(instance)]
            except KeyError:
                raise ValueError(
                    '{!r} is not in df'.format(list(instance))
                ) from None
            instance_label = labels[position]
            label_counts[instance_label] = label_counts.get(instance_label, 0) + 1
        cluster_labels.append(max(label_counts, key=label_counts.get))
    return cluster_labels

def cal_acc(clusters, cluster_labels, actual_labels, df):
    """Return the fraction of instances whose cluster label equals their true label.

    Args:
        clusters: Sequence of clusters; each cluster is a sequence of
            instances that also occur as rows of ``df``.
        cluster_labels: Predicted label per cluster.  Either one entry per
            cluster (position ``i`` describes ``clusters[i]``), or one entry
            per NON-EMPTY cluster in cluster order.
        actual_labels: Sequence of true labels, where ``actual_labels[i]``
            belongs to ``df[i]``.
        df: Sequence of instances (rows).

    Returns:
        ``correct / total`` over all clustered instances, or ``0`` when no
        instance is clustered at all.

    Raises:
        ValueError: If a clustered instance does not occur in ``df``.
    """
    # Map each distinct row to the index of its first occurrence (the same
    # result as ``list.index``) so each lookup is a hash probe rather than a
    # scan over the whole data set.
    first_index = {}
    for position, row in enumerate(df):
        first_index.setdefault(tuple(row), position)

    # A label list that skipped empty clusters is shorter than ``clusters``;
    # pairing it by raw position would shift labels onto the wrong clusters.
    # When the length matches the number of non-empty clusters, pair against
    # those instead.  Empty clusters contribute no instances either way.
    occupied = [cluster for cluster in clusters if cluster]
    scored = occupied if len(cluster_labels) == len(occupied) else clusters

    correct_assignments = 0
    total_assignments = 0
    for cluster_index, cluster in enumerate(scored):
        if not cluster:
            continue
        predicted_label = cluster_labels[cluster_index]
        for instance in cluster:
            try:
                position = first_index[tuple(instance)]
            except KeyError:
                raise ValueError(
                    '{!r} is not in df'.format(list(instance))
                ) from None
            if actual_labels[position] == predicted_label:
                correct_assignments += 1
            total_assignments += 1
    return correct_assignments / total_assignments if total_assignments > 0 else 0

# Put the raw inputs into plain-list form: rows as lists, and the first
# field of every label row as that row's label.
df_list = list(map(list, df))
labels_list = [row[0] for row in labels]


def _label_and_score(result):
    """Label the clusters of one k-means result, then score them.

    Returns the per-cluster labels and the resulting accuracy.
    """
    members = result['clusters']
    assigned = label_clusters(members, labels_list, df)
    return assigned, cal_acc(members, assigned, labels_list, df)


# Same order as before: Euclidean, then Cosine, then Jaccard.
euclidean_c_labels, acc_euclidean = _label_and_score(c_euclidean)
cosine_c_labels, acc_cosine = _label_and_score(c_cosine)
jaccard_c_labels, acc_jaccard = _label_and_score(c_jaccard)

for _heading, _accuracy in (
    ('Accuracy of Euclidean:', acc_euclidean),
    ('Accuracy of Cosine:', acc_cosine),
    ('Accuracy of Jaccard:', acc_jaccard),
):
    print(_heading, _accuracy)

Set up the stopping criteria — “when there is no change in centroid position, when the SSE value increases in the next iteration, when the maximum preset value (100) of iterations is complete” — for Euclidean-K-means, Cosine-K-means, and Jaccard-K-means.

In [None]:
from time import time

def run_condition(condition, df, k):
    """Run K-means once per distance measure and print time and iteration count.

    For each of the Euclidean, Cosine and Jaccard distances (in that order)
    this calls ``km.kmeans(df, k, dist=..., condition=condition)``, measures
    how long the call took, and prints one line with the measure's name, the
    elapsed seconds and the ``'iterations'`` entry of the result.

    Args:
        condition: Stopping-criterion name, forwarded unchanged to
            ``km.kmeans``.  This notebook uses 'centroid', 'sse' and
            'iteration'.
        df: Data set, forwarded unchanged to ``km.kmeans``.
        k: Number of clusters, forwarded unchanged to ``km.kmeans``.

    Returns:
        None.  Results are only printed, so calling this as the last
        expression of a notebook cell displays nothing extra.
    """
    # perf_counter is monotonic and high-resolution; time() can jump when
    # the system clock is adjusted, which corrupts interval measurements.
    from time import perf_counter

    metrics = (
        ('Euclidean', 'euclidean'),
        ('Cosine', 'cosine'),
        ('Jaccard', 'jaccard'),
    )
    for display_name, dist in metrics:
        started = perf_counter()
        result = km.kmeans(df, k, dist=dist, condition=condition)
        elapsed = perf_counter() - started

        print("Name: {} \t Time Taken: {} \t Iteration Count: {}".format(
            display_name, elapsed, result['iterations']))

#Stopping Criteria

When there is no change in centroid position

In [None]:
# Stop as soon as the centroids no longer move.
run_condition(condition='centroid', df=df, k=k)

When the SSE value increases in the next iteration

In [None]:
# Stop as soon as the SSE goes up from one iteration to the next.
run_condition(condition='sse', df=df, k=k)

When the maximum preset value (100) of iteration is complete

In [None]:
# Stop only once the preset maximum number of iterations has run.
run_condition(condition='iteration', df=df, k=k)

Compare the SSEs of Euclidean-K-means, Cosine-K-means, and Jaccard-K-means with respect to
the following three terminating conditions.

In [None]:
def run_condition_sse(condition, df, k):
    """Print the SSE reached by K-means for each distance measure.

    Calls ``km.kmeans(df, k, dist=..., condition=condition)`` for the
    Euclidean, Cosine and Jaccard distances in turn, and prints the
    ``'withinss'`` entry of each result right after that run finishes.

    Args:
        condition: Stopping-criterion name, forwarded to ``km.kmeans``.
        df: Data set, forwarded to ``km.kmeans``.
        k: Number of clusters, forwarded to ``km.kmeans``.
    """
    # (distance name passed to km.kmeans, output template) in run order.
    report_plan = (
        ('euclidean', "Euclidean : {}"),
        ('cosine', "Cosine : {}"),
        ('jaccard', "Jaccard : {}"),
    )
    for metric, template in report_plan:
        outcome = km.kmeans(df, k, dist=metric, condition=condition)
        print(template.format(outcome['withinss']))

The SSEs of Euclidean-K-means, Cosine-K-means, and Jaccard-K-means when there is no change in centroid position

In [None]:
# SSE per distance measure when stopping on unchanged centroids.
run_condition_sse(condition='centroid', df=df, k=k)

The SSEs of Euclidean-K-means, Cosine-K-means, and Jaccard-K-means when the SSE value increases in the next iteration

In [None]:
# SSE per distance measure when stopping on an SSE increase.
run_condition_sse(condition='sse', df=df, k=k)

The SSEs of Euclidean-K-means, Cosine-K-means, and Jaccard-K-means when the maximum preset value (100) of iterations is complete

In [None]:
# SSE per distance measure when stopping at the iteration cap.
run_condition_sse(condition='iteration', df=df, k=k)