In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt
import progressbar

In [2]:
# Directory (relative to the notebook) that holds the docword.*.txt input files.
file_path = "Data"

# Input files, processed in this order by the driver cell.
files = [f"docword.{corpus}.txt" for corpus in ("kos", "nips", "enron")]

# Largest cluster count tried; the sweep covers k = 2 .. max_k inclusive.
max_k = 15

# Restart budget handed to kmeans_pipeline for every value of k.
k_iters = 5

In [3]:
def npify(indices:set|list,
          arr_len:int):
    arr = np.zeros(arr_len)
    for index in indices:
        arr[index - 1] = 1
    return arr


def setify(arr):
    """
    Convert a dense weight vector back into a set of 1-based indices.

    This is the inverse direction of ``npify``: slot ``i`` of the vector
    corresponds to index ``i + 1``. A slot is kept when its value is at
    least half of the vector's maximum.

    Args:
        arr: 1-D sequence/array of non-negative weights.

    Returns:
        The set of 1-based indices whose weight reaches the threshold.
        An empty vector, or one whose maximum is not positive, yields an
        empty set (otherwise a zero threshold would select every slot).
    """
    if len(arr) == 0:
        return set()

    peak = np.max(arr)
    if peak <= 0:
        return set()

    threshold = 0.5 * peak
    # +1 undoes the "index - 1" shift applied by npify.
    return {position + 1
            for position, weight in enumerate(arr)
            if weight >= threshold}

In [4]:
def jac_dist(v1:set,
             v2:set):
    """
    Jaccard distance between two sets: ``1 - |v1 & v2| / |v1 | v2|``.

    Returns 1 when both sets are empty (the union has no elements).
    """
    n_union = len(v1.union(v2))
    if n_union == 0:
        return 1

    n_shared = len(v1.intersection(v2))
    return 1 - (n_shared / n_union)

In [5]:
def calc_inertia(data,
                 cluster_labels,
                 centroids):
    """
    Sum of Jaccard distances from every point to its assigned centroid.

    Args:
        data: Sequence of points (sets), one per sample.
        cluster_labels: ``cluster_labels[i]`` is the centroid index of ``data[i]``.
        centroids: Sequence of centroid sets, indexed by cluster label.

    Returns:
        The accumulated distance (0 for empty ``data``).
    """
    return sum(
        jac_dist(point, centroids[cluster_labels[position]])
        for position, point in enumerate(data)
    )

In [6]:
def _nearest_centroid(point, centroids):
    """Index of the centroid closest to ``point``; ties go to the lowest index."""
    best_idx = 0
    best_dist = jac_dist(point, centroids[0])
    for idx in range(1, len(centroids)):
        dist = jac_dist(point, centroids[idx])
        if dist < best_dist:
            best_idx, best_dist = idx, dist
    return best_idx


def custom_kmeans(data:list,
                  k:int,
                  dim:int,
                  max_iter=30,
                  past_centroids:list=None):
    """
    Custom K-Means implementation using the Jaccard distance (``jac_dist``).

    Points and centroids are sets of 1-based indices. A centroid is rebuilt
    from its members by averaging their indicator vectors (``npify``) and
    thresholding the mean back into a set (``setify``).

    Args:
        data: List of points, each a set of 1-based indices in ``1..dim``.
        k: The desired number of clusters.
        dim: Length of the indicator vector used when averaging points.
        max_iter: Maximum number of iterations (default: 30).
        past_centroids: Accepted for interface compatibility; currently unused.

    Returns:
        A tuple containing:
            cluster_labels: One cluster label per data point (``len(data)`` entries),
                assigned against the final centroids.
            centroids: The final centroids (cluster centers) after convergence.
            inertia: The inertia for the final centroids and clusters.

    Raises:
        ValueError: From ``np.random.choice`` when ``k`` exceeds ``len(data)``.
    """
    # Initialize centroids randomly: k distinct data points
    rnd = np.random.choice(a=len(data),
                           size=k,
                           replace=False)
    centroids = [data[idx] for idx in rnd]

    # Iterate for max_iter or until convergence
    for _ in range(max_iter):
        # Assign data points to closest centroids (fresh labels every pass)
        cluster_labels = [_nearest_centroid(point, centroids) for point in data]

        # Per-cluster member count and summed indicator vectors, rebuilt from
        # scratch each pass so earlier assignments do not leak into the mean.
        counts = [0] * k
        sums = [np.zeros(dim) for _ in range(k)]
        for point, label in zip(data, cluster_labels):
            counts[label] += 1
            sums[label] += npify(point, dim)

        # Update centroids (mean of assigned points)
        new_centroids = []
        for idx in range(k):
            if counts[idx] == 0:
                # Empty cluster: there is nothing to average, keep the old centroid.
                new_centroids.append(centroids[idx])
            else:
                new_centroids.append(setify(sums[idx] / counts[idx]))

        # Check for convergence: each centroid is compared with its own
        # predecessor only. Equality is tested first because jac_dist reports
        # distance 1 for two empty sets.
        converged = all(new == old or jac_dist(new, old) < 1e-3
                        for new, old in zip(new_centroids, centroids))
        centroids = new_centroids
        if converged:
            break

    # Update clustering after centroids have been updated
    cluster_labels = [_nearest_centroid(point, centroids) for point in data]

    # Calculate Inertia
    inertia = calc_inertia(
                data=data,
                cluster_labels=cluster_labels,
                centroids=centroids
                )

    return cluster_labels, centroids, inertia

In [7]:
def _read_docword(path):
    """Parse a docword file into ``(D, W, NNZ, documents)``; documents are sets of word ids."""
    with open(path, 'r') as source:
        # Header: three integer lines.
        D = int(next(source).strip())
        W = int(next(source).strip())
        NNZ = int(next(source).strip())

        data = []
        current_doc = None
        words = []
        for line in source:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            d, w, _ = map(int, fields)
            # A change of doc id closes the previous document. Tracking the id
            # itself (rather than comparing it with len(data)) keeps the first
            # document in one piece and copes with skipped doc ids.
            if d != current_doc:
                if current_doc is not None:
                    data.append(set(words))
                current_doc = d
                words = []
            words.append(w)
        if current_doc is not None:
            data.append(set(words))

    return D, W, NNZ, data


def kmeans_pipeline(file:str,
                    max_k:int,
                    k_iters:int):
    """
    Load one docword file and sweep k-means over k = 2 .. max_k.

    The file is read from the module-level ``file_path`` directory. Its first
    three lines are the integers D, W and NNZ; every following line is a
    ``doc_id word_id count`` triple (the count is ignored). Consecutive lines
    with the same doc id form one document, stored as a set of word ids.

    For every k, ``custom_kmeans`` is run up to ``k_iters`` times and the
    lowest inertia is kept. From the second k onwards the restarts stop early
    as soon as the best inertia is no worse than the previous k's.

    Args:
        file: File name inside ``file_path``; the part between the first two
            dots is used (upper-cased) as the display name.
        max_k: Largest number of clusters to try.
        k_iters: Maximum number of k-means runs per k (expected to be >= 1).

    Returns:
        A list with the best inertia for each k in 2 .. max_k, or the string
        "Failure: Data Read Improperly" when the number of (doc, word) pairs
        read does not match NNZ.
    """
    D, W, NNZ, data = _read_docword(f'{file_path}/{file}')

    # Sanity Check
    read_words = sum(len(doc) for doc in data)
    if read_words != NNZ:
        return "Failure: Data Read Improperly"

    file_name = file.split('.')[1].upper()

    widgets = [f'Clustering on {file_name}: ', progressbar.Percentage(), ' | ',
            progressbar.Timer(), ' | (', progressbar.ETA(), ') ']
    bar = progressbar.ProgressBar(
        maxval=(max_k - 1),
        widgets=widgets)\
            .start()

    k_inertia = []
    for k in range(2, max_k + 1):
        min_inertia = float('inf')

        for _ in range(k_iters):
            _, _, inertia = \
                custom_kmeans(
                    data=data,
                    k=k,
                    dim=W
                    )
            min_inertia = min(min_inertia, inertia)

            # Once this k does at least as well as the previous k there is no
            # need for further restarts. The first k has nothing to compare
            # against, so it always uses the full budget.
            if k_inertia and min_inertia <= k_inertia[-1]:
                break

        bar.update(k - 2 + 1)
        k_inertia.append(min_inertia)

    bar.finish()

    print(f'Sparsity of Matrix on {file_name}: {NNZ / (D * W) * 100:.3f} %')
    print()

    return k_inertia

In [8]:
# Run the k sweep on every input file; each log entry is
# (display name, list of best inertia per k).
logs = []
for file in files:
    k_inertia = kmeans_pipeline(
                    file=file,
                    max_k=max_k,
                    k_iters=k_iters)

    # kmeans_pipeline reports a failed read by returning a message string
    # instead of a list; such a result cannot be plotted, so report and skip it.
    if isinstance(k_inertia, str):
        print(f'Skipping {file}: {k_inertia}')
        continue

    file_name = file.split('.')[1].upper()
    logs.append((file_name, k_inertia))

Clustering on KOS: 100% | Elapsed Time: 0:04:06 | (ETA:  0:00:00)              

Sparsity of Matrix on KOS: 1.491 %



Clustering on NIPS: 100% | Elapsed Time: 0:07:10 | (ETA:  0:00:00)             

Sparsity of Matrix on NIPS: 4.006 %



Clustering on ENRON: 100% | Elapsed Time: 1:05:02 | (ETA:  0:00:00)            

Sparsity of Matrix on ENRON: 0.331 %



In [9]:
# One elbow plot (inertia vs. k) per input file, written to Output/Random/.
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('Output/Random', exist_ok=True)

for file_name, k_inertia in logs:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(list(range(2, max_k + 1)),
            k_inertia,
            marker='o',
            linestyle='-')
    ax.set_title(f'Clustering on {file_name}')
    ax.set_xlabel('K')
    # The plotted inertia is a sum of Jaccard distances, not similarities.
    ax.set_ylabel('Inertia (Jaccard Distance)')
    # Small margin around the data range, never dropping below zero.
    ax.set_ylim([max(min(k_inertia) - 0.01 * max(k_inertia), 0), 1.01 * max(k_inertia)])
    fig.savefig(f'Output/Random/{file_name}.png')
    # Close explicitly so figures do not accumulate in memory across the loop.
    plt.close(fig)