In [None]:
from VecSim import *
import numpy as np

# Shape of the data set used throughout the notebook
dim = 100
num_elements = 100000

# HNSW settings that are copied onto the parameter object below
M = 32
efConstruction = 200
efRuntime = 200

# Describe the index: float32 vectors of length `dim`, compared with the L2 metric
hnswparams = HNSWParams()
hnswparams.type = VecSimType_FLOAT32
hnswparams.metric = VecSimMetric_L2
hnswparams.dim = dim
hnswparams.initialCapacity = num_elements
hnswparams.M = M
hnswparams.efConstruction = efConstruction
hnswparams.efRuntime = efRuntime

# Build the (still empty) HNSW index from the parameters above
hnsw_index = HNSWIndex(hnswparams)


In [None]:
# Generate `num_elements` random float32 vectors and insert them into the index.
# Seed NumPy's global RNG first so that a fresh "Restart & Run All" produces the
# same data set (and, further down, the same query vector), which makes the
# recall numbers comparable between runs.
np.random.seed(42)
data = np.float32(np.random.random((num_elements, dim)))

# (label, vector) pairs kept on the Python side for the brute-force ground truth
vectors = []

for i, vector in enumerate(data):
    # The row number doubles as the label of the vector inside the index
    hnsw_index.add_vector(vector, i)
    vectors.append((i, vector))

print(f'Index size: {hnsw_index.index_size()}')

In [None]:
# NOTE(review): set_ef presumably overrides the efRuntime configured at index
# creation for the queries below - confirm against the VecSim bindings.
hnsw_index.set_ef(300)

# Create a random query vector, shaped (1, dim) as passed to the index API
query_data = np.float32(np.random.random((1, dim)))

# Create batch iterator for this query vector and reset the bookkeeping that
# the batch cell updates every time it is run
batch_iterator = hnsw_index.create_batch_iterator(query_data)
returned_results_num = 0
accumulated_labels = []
total_time = 0

from scipy import spatial

# Brute-force ground truth: distance of every stored vector from the query,
# sorted ascending, as (distance, label) tuples.
# scipy.spatial.distance.euclidean only accepts 1-D vectors; passing the 2-D
# (1, dim) array was deprecated in SciPy 1.7 and raises
# "ValueError: Input vector should be 1-D." from SciPy 1.9 on, so hand it the
# single row instead.
query_vector = query_data[0]
dists = sorted(
    (spatial.distance.euclidean(query_vector, vec), key) for key, vec in vectors
)

In [None]:
# Fetch the next batch of results from the iterator.
# Re-running this cell pulls the following batch and keeps accumulating the
# counters (total_time, returned_results_num, accumulated_labels).
import time

batch_size = 100

start = time.time()
labels, distances = batch_iterator.get_next_results(batch_size, BY_SCORE)
total_time = total_time + (time.time() - start)

# Report the rank range covered by this batch together with its contents
print(f'Results in rank {returned_results_num}-{returned_results_num+len(labels[0])} are: \n')
print(f'scores: {distances}\n')
print(f'labels: {labels}')

# Remember what has been returned so far; the recall cell relies on both
accumulated_labels.extend(labels[0])
returned_results_num += len(labels[0])


In [None]:
# Recall and accumulated search time of the batches fetched so far

# Labels of the true nearest neighbours: the head of the sorted (distance, label) list
keys = [pair[1] for pair in dists[:returned_results_num]]
# Number of returned labels that are among the true nearest neighbours
correct = len(set(keys) & set(accumulated_labels))

print(f'Total search time: {total_time}')
print(
    f'Recall for {returned_results_num} results in index of size {num_elements} with dim={dim} is: ',
    correct/returned_results_num,
)

In [None]:
# Compare to a "standard" KNN search that asks for the same number of results
# in a single query

start = time.time()
knn_results = hnsw_index.knn_query(query_data, returned_results_num)
print(f'Total search time: {time.time() - start}')
labels_knn, distances_knn = knn_results

# Recall against the same brute-force ground truth used for the batch iterator
keys = [pair[1] for pair in dists[:returned_results_num]]
correct = len(set(keys) & set(labels_knn[0]))
print(
    f'Recall for {returned_results_num} results in index of size {num_elements} with dim={dim} is: ',
    correct/returned_results_num,
)

In [None]:
# Run batches until depleted: restart the iterator and keep pulling batches
# until it reports that nothing is left, counting how many results come back.
batch_iterator.reset()
returned_results_num = 0
batch_size = 100

start = time.time()
while batch_iterator.has_next():
    labels, distances = batch_iterator.get_next_results(batch_size, BY_ID)
    returned_results_num += len(labels[0])
# Stop the clock right after the loop so that the console output below is not
# counted as search time.
search_time = time.time() - start

print(f'Total results returned: {returned_results_num}\n')
print(f'Total search time: {search_time}')