In [None]:
import numpy as np
from scipy.cluster.vq import kmeans2, vq
from scipy.spatial.distance import cdist
import time

In [None]:
def PQ_train(vectors, M, k):
    """Learn a product-quantization codebook from training vectors.

    The columns of ``vectors`` are cut into ``M`` contiguous segments of
    equal length ``vectors.shape[1] // M`` and ``kmeans2`` (with its default
    settings) is run independently on every segment.

    Parameters
    ----------
    vectors : 2-D array
        Training data, one vector per row.
    M : int
        Number of segments each vector is split into.
    k : int
        Number of centroids learned per segment.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(M, k, segment_length)`` where entry
        ``[m]`` holds the centroids found for segment ``m``.

    Notes
    -----
    When the vector length is not a multiple of ``M`` the trailing columns
    that do not fit into a full segment are never read.
    """
    seg_len = vectors.shape[1] // M
    codebook = np.empty(shape=(M, k, seg_len), dtype=np.float32)

    for seg in range(M):
        start = seg * seg_len
        stop = start + seg_len
        # kmeans2 returns (centroids, labels); only the centroids are kept.
        centroids, _ = kmeans2(vectors[:, start:stop], k)
        codebook[seg] = centroids

    return codebook

In [None]:
def PQ_encode(vectors, codebook):
    """Encode vectors as product-quantization codes.

    Each vector is split into ``M`` segments (matching the codebook layout)
    and every segment is replaced by the id of its nearest centroid.

    Parameters
    ----------
    vectors : 2-D array
        Vectors to encode, one per row.
    codebook : numpy.ndarray
        Array of shape ``(M, k, s)``: ``k`` centroids of length ``s`` for each
        of the ``M`` segments.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vectors), M)`` with the centroid id chosen for
        every segment. The dtype is the smallest unsigned integer type able
        to hold ``k - 1`` (``uint8`` whenever ``k <= 256``).
    """
    M, k, s = codebook.shape
    # A fixed uint8 would silently wrap centroid ids modulo 256 when k > 256,
    # so size the code dtype from the number of centroids instead.
    code_dtype = np.min_scalar_type(max(k - 1, 0))
    PQ_code = np.empty((vectors.shape[0], M), code_dtype)

    for m in range(M):
        sub_vectors = vectors[:, m*s:(m+1)*s]           # Sub-vectors for segment m.
        centroid_ids, _ = vq(sub_vectors, codebook[m])  # vq returns the nearest centroid Ids.
        PQ_code[:, m] = centroid_ids                    # Assign centroid Ids to PQ_code.

    return PQ_code

In [None]:
def PQ_search(query_vector, codebook, PQ_code):
    """Rank PQ-encoded vectors by approximate distance to a query.

    A table of squared Euclidean distances between every query segment and
    every centroid of that segment is built first; the distance to each
    encoded vector is then the sum of the table entries selected by its code.

    Parameters
    ----------
    query_vector : 1-D sequence
        The query; it is sliced into segments of length ``s``.
    codebook : numpy.ndarray
        Array of shape ``(M, k, s)`` holding the centroids per segment.
    PQ_code : numpy.ndarray
        Integer array of shape ``(N, M)`` with one centroid id per segment
        for each of the ``N`` encoded vectors.

    Returns
    -------
    index : numpy.ndarray
        Positions of the encoded vectors ordered by increasing distance
        (``np.argsort`` of ``distances``).
    distances : numpy.ndarray
        ``float32`` array of length ``N`` with the summed partial distances.
    """
    M, k, s = codebook.shape

    #=====================================================================
    # Build the distance table.
    #=====================================================================

    distance_table = np.empty((M, k), np.float32)    # Shape is (M, k)
    for m in range(M):
        query_segment = query_vector[m*s:(m+1)*s]    # Query vector for segment m.
        distance_table[m] = cdist([query_segment], codebook[m], "sqeuclidean")[0]

    #=====================================================================
    # Look up the partial distances from the distance table.
    #=====================================================================

    N, M = PQ_code.shape
    distances = np.zeros((N,), dtype=np.float32)

    # One vectorized gather per segment instead of a Python loop over all
    # N * M codes. Segments are still accumulated in order m = 0..M-1, so the
    # float32 sums are the same as with element-wise accumulation.
    for m in range(M):
        distances += distance_table[m, PQ_code[:, m]]

    index = np.argsort(distances)
    return index, distances

In [None]:
def PQ(points, queries, M=8, k=256):
    """Train, encode and query a product quantizer, printing timings.

    Parameters
    ----------
    points : 2-D array
        Database vectors; used both to train the codebook and as the set
        that is encoded and searched.
    queries : 2-D array
        Query vectors, one per row; each is searched independently.
    M : int, optional
        Number of segments per vector (default 8).
    k : int, optional
        Number of centroids per segment (default 256).

    Returns
    -------
    None
        Results and timings are reported with ``print`` only.
    """
    xb = points

    t1 = time.perf_counter()
    codebook = PQ_train(xb, M, k)
    t2 = time.perf_counter()
    PQ_code = PQ_encode(xb, codebook)
    t3 = time.perf_counter()

    # Keep the accumulator under its own name: assigning to `time` inside the
    # function would make it a local and break every time.perf_counter() call.
    total_search_time = 0.0
    num_queries = queries.shape[0]
    for i in range(num_queries):
        t4 = time.perf_counter()
        index, distances = PQ_search(queries[i], codebook, PQ_code)
        t5 = time.perf_counter()
        print('Top 10 nearest neighbours for queries:', index[0:10])
        print('Query search time:', t5-t4, 's')
        total_search_time += (t5-t4)

    print('Codebook generation time:', t2-t1, 's')
    print('Encoding(PQ_code) time for dataset:', t3-t2, 's')
    if num_queries:
        # Skip the average when there are no queries to avoid dividing by zero.
        print('Average query search time:', total_search_time/num_queries)