In [1]:
# Q1, Q2, Q3
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_Jarcard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]

    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_Jarcard_similarity, 'Jarcard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25417050736.6164, 'Accuracy': 0.6101610161016101, 'Iterations': 56}, 'Cosine': {'SSE': 696.5396766140344, 'Accuracy': 0.598059805980598, 'Iterations': 45}, 'Jarcard': {'SSE': 3661.0351320784384, 'Accuracy': 0.5910591059105911, 'Iterations': 34}}


In [2]:
# Q4 - when there is no change in centroid position
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_Jarcard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_Jarcard_similarity, 'Jarcard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25318230726.596188, 'Accuracy': 0.5897589758975897, 'Iterations': 99}, 'Cosine': {'SSE': 686.3564371585464, 'Accuracy': 0.6305630563056306, 'Iterations': 18}, 'Jarcard': {'SSE': 3660.6493884527554, 'Accuracy': 0.6006600660066007, 'Iterations': 47}}


In [3]:
# Q4 - when the SSE value increases in the next iteration
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_Jarcard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_Jarcard_similarity, 'Jarcard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25573275367.152157, 'Accuracy': 0.5254525452545254, 'Iterations': 99}, 'Cosine': {'SSE': 697.1928287106697, 'Accuracy': 0.6461646164616461, 'Iterations': 34}, 'Jarcard': {'SSE': 3725.455508785065, 'Accuracy': 0.6302630263026303, 'Iterations': 24}}


In [4]:
# Q4 - when the maximum preset value (e.g., 100) of iteration is complete
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_Jarcard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('data.csv').values
y = pd.read_csv('label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_Jarcard_similarity, 'Jarcard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25434321571.960716, 'Accuracy': 0.6025602560256026, 'Iterations': 99}, 'Cosine': {'SSE': 684.9773589119252, 'Accuracy': 0.6259625962596259, 'Iterations': 99}, 'Jarcard': {'SSE': 3691.9428610767923, 'Accuracy': 0.5528552855285529, 'Iterations': 99}}
