In [5]:
import math

import numba
import numpy as np
from numba import jit, njit, prange, vectorize

In [3]:
import time

In [75]:
@njit(parallel=True)
def parallelized_kmeans(A, N, init_centroids, num_centers=5, iterations=20, D=10):
    """Run a fixed number of k-means (Lloyd) iterations, compiled with numba.

    Parameters
    ----------
    A : 2-D float array
        Data points, one per row. Only the first ``N`` rows and the first
        ``D`` columns are read.
    N : int
        Number of data points to cluster.
    init_centroids : 2-D array
        Starting centroids, one per row. It is not modified.
    num_centers : int, optional
        Number of clusters (rows of ``init_centroids`` that are used).
    iterations : int, optional
        Number of assignment/update passes to perform.
    D : int, optional
        Number of features per point.

    Returns
    -------
    2-D float64 array of shape ``(num_centers, D)`` when ``init_centroids``
    has exactly that shape (otherwise the shape of ``init_centroids``),
    holding the centroids after the last pass.

    Notes
    -----
    A cluster that receives no points keeps its previous centroid instead of
    dividing by zero.
    """
    # Work on a float64 copy so the caller's array is never written to.
    centroids = init_centroids.astype(np.float64)

    # Each pass consumes the centroids produced by the previous one, so this
    # loop has a cross-iteration dependency and must stay sequential.
    for _ in range(iterations):
        labels = np.empty(N, dtype=np.int64)

        # Assignment step: every point is independent of the others, which
        # makes this the loop that is safe to run with prange.
        for i in prange(N):
            nearest = 0
            nearest_dist = np.inf
            for j in range(num_centers):
                sq = 0.0
                for k in range(D):
                    delta = A[i, k] - centroids[j, k]
                    sq += delta * delta
                d = math.sqrt(sq)
                # Strict '<' keeps the lowest index on ties, like argmin.
                if d < nearest_dist:
                    nearest_dist = d
                    nearest = j
            labels[i] = nearest

        # Update step: accumulate per-cluster sums sequentially; different
        # points write to the same cluster row, so no prange here.
        sums = np.zeros((num_centers, D), dtype=np.float64)
        counts = np.zeros(num_centers, dtype=np.int64)
        for i in range(N):
            c = labels[i]
            counts[c] += 1
            for k in range(D):
                sums[c, k] += A[i, k]

        updated = centroids.copy()
        for c in range(num_centers):
            # Empty clusters keep their previous centroid.
            if counts[c] > 0:
                for k in range(D):
                    updated[c, k] = sums[c, k] / counts[c]
        centroids = updated

    return centroids

In [79]:
def serial_kmeans(A, N, init_centroids, clusters=5, iterations=20, D=10):
    """Run a fixed number of k-means (Lloyd) iterations in plain Python/numpy.

    Parameters
    ----------
    A : 2-D array
        Data points, one per row; ``A`` is expected to have ``N`` rows.
    N : int
        Number of data points to cluster.
    init_centroids : 2-D array-like
        Starting centroids, one per row. It is not modified.
    clusters : int, optional
        Number of clusters (rows of ``init_centroids`` that are used).
    iterations : int, optional
        Number of assignment/update passes to perform.
    D : int, optional
        Number of feature columns whose centroid coordinates are updated.

    Returns
    -------
    numpy.ndarray
        Float array with the centroids after the last pass.

    Notes
    -----
    A cluster that receives no points keeps its previous centroid instead of
    becoming NaN from a 0/0 division.
    """
    # Float copy: keeps the caller's array untouched and allows mean updates.
    centroids = np.array(init_centroids, dtype=float)

    for _ in range(iterations):
        # Assignment step: index of the closest centroid for every point.
        labels = np.empty(N, dtype=np.int64)
        for i in range(N):
            row = [math.sqrt(np.sum((A[i, :] - centroids[j, :]) ** 2))
                   for j in range(clusters)]
            labels[i] = int(np.argmin(row))

        # Update step: each centroid becomes the mean of its members.
        updated = centroids.copy()
        for c in range(clusters):
            members = labels == c
            count = int(members.sum())
            if count == 0:
                # Nothing assigned to this cluster; leave its centroid as is.
                continue
            for j in range(D):
                updated[c, j] = np.sum(A[members, j]) / count
        centroids = updated

    return centroids

In [88]:
def compare_time():
    """Time parallelized_kmeans against serial_kmeans on seeded random data.

    Generates 10000 points with 10 features and 5 initial centroids from a
    fixed seed, runs both implementations with the same inputs, prints each
    result with its own wall-clock duration, and reports whether the two
    centroid arrays agree within floating-point tolerance.

    Returns nothing; all results are printed.
    """
    size = 10000
    features = 10
    clusters = 5
    iterations = 20

    np.random.seed(0)
    initial_centroids = np.random.ranf((clusters, features))
    print("Initial Centroids : ",initial_centroids)
    print("\n")
    data = np.random.ranf((size, features))

    # NOTE: parallelized_kmeans is njit-decorated, so the first call in a
    # fresh kernel presumably includes JIT compilation time as well.
    startp = time.time()
    k_means = parallelized_kmeans(A=data, N=size, init_centroids=initial_centroids,
                                  num_centers=clusters, iterations=iterations,
                                  D=features)
    endp = time.time()

    print("Clusters (parallel) : \n",k_means)
    print("\nTime taken : {:.2f} seconds\n\n".format(endp-startp))

    # The serial run gets its own start/end pair so its reported time does
    # not include the parallel run above.
    starts = time.time()
    serial_k_means = serial_kmeans(A=data, N=size, init_centroids=initial_centroids,
                                   clusters=clusters, iterations=iterations,
                                   D=features)
    ends = time.time()

    print("Clusters (serial) : \n",serial_k_means)
    print("\nTime taken : {:.2f} seconds\n".format(ends-starts))

    # Compare element-wise with a tolerance: the two implementations may sum
    # in different orders, and equal totals would not imply equal arrays.
    if np.allclose(serial_k_means, k_means):
        print("\nSuccessful execution")
    else:
        print("\nMismatch between serial and parallel centroids")

In [89]:
# Run the benchmark; it prints the initial centroids, both results and their timings.
compare_time()

Initial Centroids :  [[0.5488135  0.71518937 0.60276338 0.54488318 0.4236548  0.64589411
  0.43758721 0.891773   0.96366276 0.38344152]
 [0.79172504 0.52889492 0.56804456 0.92559664 0.07103606 0.0871293
  0.0202184  0.83261985 0.77815675 0.87001215]
 [0.97861834 0.79915856 0.46147936 0.78052918 0.11827443 0.63992102
  0.14335329 0.94466892 0.52184832 0.41466194]
 [0.26455561 0.77423369 0.45615033 0.56843395 0.0187898  0.6176355
  0.61209572 0.616934   0.94374808 0.6818203 ]
 [0.3595079  0.43703195 0.6976312  0.06022547 0.66676672 0.67063787
  0.21038256 0.1289263  0.31542835 0.36371077]]


Clusters (parallel) : 
 [[0.50758333 0.55082095 0.5341867  0.45886483 0.71012713 0.55054546
  0.51541065 0.62155111 0.77153917 0.44055847]
 [0.56929748 0.40531163 0.51547778 0.72803951 0.53497814 0.30943767
  0.35363885 0.44871313 0.49695484 0.68286129]
 [0.58941929 0.59676212 0.55367476 0.57397303 0.34211243 0.61523201
  0.5304572  0.67403468 0.28451154 0.4095317 ]
 [0.39955644 0.41143764 0.42314124