In [85]:
# Star import is kept first so the explicit imports below take precedence over
# any same-named symbol it might export.
from VecSim import *

import time

import numpy as np
from scipy import spatial

# Dataset geometry: every vector has `dim` components and `num_elements`
# vectors are added to the index by the next cell.
dim = 100
num_elements = 100000

# HNSW tuning values, copied onto the HNSWParams object below.
M = 16
efConstruction = 100
efRuntime = 100

# Create an HNSW index for float32 vectors of `dim` components.
# Use 'L2' as the distance metric.
hnswparams = HNSWParams()
hnswparams.M = M
hnswparams.efConstruction = efConstruction
# Initial capacity matches the number of vectors the next cell inserts.
hnswparams.initialCapacity = num_elements
hnswparams.efRuntime = efRuntime
hnswparams.dim = dim
hnswparams.type = VecSimType_FLOAT32
hnswparams.metric = VecSimMetric_L2

hnsw_index = HNSWIndex(hnswparams)


In [86]:
# Add `num_elements` random float32 vectors to the index, using each row's
# position in `data` as its label.
data = np.float32(np.random.random((num_elements, dim)))

# Keep (label, vector) pairs so a later cell can compute exact distances
# for the recall measurement.
vectors = list(enumerate(data))

for i, vector in vectors:
    hnsw_index.add_vector(vector, i)

print(f'Index size: {hnsw_index.index_size()}')

Index size: 100000


In [115]:
# Create a random query vector: a float32 array of shape (1, dim)
query_data = np.float32(np.random.random((1, dim)))

# Create batch iterator for this query vector
batch_iterator = hnsw_index.create_batch_iterator(query_data)

# Running totals, updated each time the "next best results" cell is executed
returned_results_num = 0
accumulated_labels = []
total_time = 0

# Brute-force ground truth: the Euclidean distance from the query to every
# stored vector, sorted ascending as (distance, label) pairs.
# euclidean() works on 1-D vectors, so hand it the single row of the (1, dim)
# query instead of the 2-D array (newer SciPy releases raise ValueError on
# non-1-D input rather than squeezing it).
query_vector = query_data[0]
dists = sorted(
    (spatial.distance.euclidean(query_vector, vec), key) for key, vec in vectors
)

In [121]:
# Get the next best results

batch_size = 100

# Time only the search call, and add it to the running total so that
# `total_time` covers every batch fetched so far rather than just this one.
start = time.perf_counter()
labels, distances = batch_iterator.get_next_results(batch_size, BY_SCORE)
total_time += time.perf_counter() - start

print(f'Results in rank {returned_results_num}-{returned_results_num+len(labels[0])} are: \n')
print(f'labels: {labels}')
print(f'scores: {distances}')

# Fold this batch into the running totals consumed by the recall cell
returned_results_num += len(labels[0])
accumulated_labels.extend(labels[0])


Results in rank 400-500 are: 

labels: [[85602 71886 38586 20840 16501  9839 97131 12163 19486 16846 69316 18421
  28723 82420   847 55222 78395 57627 47447 21985 71618 18576 37049 25449
  84838 20458 24685 90356 44151 39332 51179 30482 19220 34277  4330 68411
  86330 28165  2344 60693 85314 72024 48715 86599   452 58873 38753 10933
  33191 33220 77016 54614 38673 92305 76387 20629 22731 84288 57485 48644
  11157 15929 44771 24178  6345  9757 56028 15974 80509 75234 87195 21142
  21638 32903 41129 18624 85677 16031 97265 39388 58320 41818 93700 76566
  95609 99288 78509 63325 84756 91322 18325  3898 19954 30765   944 83791
  53090 77805 20235 86845]]
scores: [[11.139572  11.2511835 11.4095745 11.443998  11.448523  11.462155
  11.532776  11.589312  11.592493  11.612871  11.61525   11.637443
  11.6501465 11.689689  11.694485  11.695381  11.704545  11.720498
  11.909647  11.942846  11.968911  11.996417  12.042768  12.067334
  12.067562  12.079513  12.081314  12.082407  12.083362  12.08507

In [122]:
# Measure recall and time

# Labels of the exact nearest neighbours, taking as many as the batch
# iterator has returned so far
keys = [key for _, key in dists[:returned_results_num]]
correct = len(set(accumulated_labels).intersection(keys))

# Recall is undefined until at least one batch has been fetched; report NaN
# instead of failing with ZeroDivisionError.
recall = correct / returned_results_num if returned_results_num else float('nan')

print(f'Total search time: {total_time}')
print(f'Recall for {returned_results_num} results in index of size {num_elements} with dim={dim} is: ', recall)

Total search time: 0.0006246566772460938
Recall for 500 results in index of size 100000 with dim=100 is:  0.768


In [130]:
# Run batches until depleted
# NOTE: this resets the shared iterator and `returned_results_num`, but not
# `accumulated_labels`; re-run the query cell before measuring recall again.
batch_iterator.reset()
returned_results_num = 0
batch_size = 20

start = time.perf_counter()
while batch_iterator.has_next():
    labels, distances = batch_iterator.get_next_results(batch_size, BY_ID)
    returned_results_num += len(labels[0])
# Stop the clock before printing so output I/O is not counted as search time
elapsed = time.perf_counter() - start

print(f'Total results returned: {returned_results_num}\n')
print(f'Total search time: {elapsed}')

Total results returned: 99994

Total search time: 0.06823587417602539
