In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
from sklearn.metrics import jaccard_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [None]:
# Reading data from CSV file
file_path = 'C:/Education/ASU/Semester_2/DM/My_HW/kmeans_data/data.csv'
data = pd.read_csv(file_path)
data
file_path2 = 'C:/Education/ASU/Semester_2/DM/My_HW/kmeans_data/label.csv'
labelData = pd.read_csv(file_path2)
true_labels = labelData.values

features = data.values

scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

In [13]:
# Defining Jaccard Similarity
def jaccard_sim(a, b):
    intersection = np.sum(np.minimum(a, b))
    union = np.sum(np.maximum(a, b))
    return intersection / union if union != 0 else 0

In [None]:
# The K-means function
def kmeans(X, k, sim='euclidean', max_iters=100):
    centroids = X[np.random.choice(len(X), k, replace=False)]
    
    for _ in range(max_iters):
        if sim == 'euclidean':
            dist = pairwise_distances(X, centroids, metric='euclidean')
        elif sim == 'cosine':
            dist = 1 - cos_sim(X, centroids)
        elif sim == 'jaccard':
            dist = np.array([1 - jaccard_sim(X[i], centroid) for i in range(len(X)) for centroid in centroids])
            dist = dist.reshape(len(X), k)
        else:
            raise ValueError("Invalid similarity metric. Use 'euclidean', 'cosine', or 'jaccard'.")
        
        labels = np.argmin(dist, axis=1)
        new_centroids = np.array([X[labels == i].mean(axis=0) if np.sum(labels == i) > 0 else X[np.random.choice(len(X))] for i in range(k)])
        
        if np.all(new_centroids == centroids):
            break
        
        centroids = new_centroids
    
    sse = np.sum((X - centroids[labels]) ** 2)
    
    return centroids, labels, sse

# Apply k-means with Euclidean similarity
k = len(labelData['7'].unique())
centroids_euclidean, labels_euclidean, sse_euclidean = kmeans(features, k, sim='euclidean')

# Apply k-means with Cosine similarity
centroids_cosine, labels_cosine, sse_cosine = kmeans(features, k, sim='cosine')

# Apply k-means with Jaccard similarity
centroids_jaccard, labels_jaccard, sse_jaccard = kmeans(features, k, sim='jaccard')

In [15]:
# Print SSEs
print(f"SSE (Euclidean): {sse_euclidean}")
print(f"SSE (Cosine): {sse_cosine}")
print(f"SSE (Jaccard): {sse_jaccard}")

SSE (Euclidean): 25322440086.71399
SSE (Cosine): 25550469287.82114
SSE (Jaccard): 25411717194.158615
