In [26]:
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
from sklearn.metrics import jaccard_score
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity as cos_sim
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from scipy.stats import mode

In [27]:
# Reading data from CSV file
file_path = 'C:/Education/ASU/Semester_2/DM/My_HW/kmeans_data/data.csv'
data = pd.read_csv(file_path)
data
file_path2 = 'C:/Education/ASU/Semester_2/DM/My_HW/kmeans_data/label.csv'
labelData = pd.read_csv(file_path2)
true_labels = labelData.values

features = data.values

scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

In [28]:
# Defining Jaccard Similarity
def jaccard_similarity(a, b):
    intersection = np.sum(np.minimum(a, b))
    union = np.sum(np.maximum(a, b))
    return intersection / union if union != 0 else 0

In [29]:
# The K-means function
def kmeans(X, k, sim='euclidean', max_iters=1000, tol=1e-4):
    # Initialize centroids randomly
    centroids = X[np.random.choice(X.shape[0], k, replace=False)]
    
    iterations = 0  # Track the number of iterations
    
    prev_sse = np.inf  # Initialize SSE to infinity for the first iteration
    
    for iterations in range(max_iters):
        # Compute distances from data points to centroids
        if sim == 'euclidean':
            dist = pairwise_distances(X, centroids, metric='euclidean')
        elif sim == 'cosine':
            dist = 1 - cos_sim(X, centroids)
        elif sim == 'jaccard':
            dist = np.array([1 - jaccard_similarity(X[i], centroid) for i in range(len(X)) for centroid in centroids])
            dist = dist.reshape(len(X), k)

        # Assign each data point to the closest centroid
        labels = np.argmin(dist, axis=1)
        
        # Update centroids
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(k)])
        
        # Calculate SSE
        sse = np.sum([np.sum((X[labels == i] - new_centroids[i])**2) for i in range(k)])
        
        # Check convergence criteria
        if np.linalg.norm(new_centroids - centroids) < tol or sse > prev_sse:
            break
        
        previous_sse = sse  # Update SSE for the next iteration
        iterations += 1  # Increment iteration count
        
        # Update centroids for the next iteration
        centroids = new_centroids
    
    return centroids, labels, sse, iterations

# Example usage
k = len(labelData['7'].unique())
# Apply k-means with Euclidean similarity
centroids_euclidean, labels_euclidean, sse_euclidean, iters_euclidean = kmeans(features, k, sim='euclidean')

# Apply k-means with Cosine similarity
centroids_cosine, labels_cosine, sse_cosine, iters_cosine = kmeans(features, k, sim='cosine')

# Apply k-means with Jaccard similarity
centroids_jaccard, labels_jaccard, sse_jaccard, iters_jaccard = kmeans(features, k, sim='jaccard')

# Print the number of iterations taken for convergence
print("Iterations for convergence:")
print("Euclidean:", iters_euclidean)
print("Cosine:", iters_cosine)
print("Jaccard:", iters_jaccard)


Iterations for convergence:
Euclidean: 62
Cosine: 55
Jaccard: 36
