In [1]:
from lsh_forest import LSHForest
import numpy as np
from time import perf_counter_ns


First, we generate a set of random vectors and index them with an LSH Forest, timing how long the indexing takes.

In [2]:
# Benchmark configuration.
num_vec = 1_000_000   # number of random vectors to generate and index
dim = 4               # dimensionality of each vector
nbits = 8             # hash size in bits
num_hash_tables = 4   # number of hash tables in the forest
seed = 42             # fixed seed so Restart & Run All regenerates the same dataset

# Draw the dataset from a seeded Generator instead of the unseeded global
# NumPy RNG. The result is still a (num_vec, dim) float64 array of uniform
# samples in [0, 1), but it is now identical on every run.
rng = np.random.default_rng(seed)
data = rng.random((num_vec, dim))

lsh_forest = LSHForest(nbits=nbits, dim=dim, num_hash_tables=num_hash_tables, distance_metric="cosine", data_ref=data)

print(f"Indexing {num_vec} vectors of dim {dim} with hash size of {nbits} bits across {num_hash_tables} hash tables...")
# Time only the index build; perf_counter_ns() returns nanoseconds, hence the
# division by 1e9 to report seconds.
t0 = perf_counter_ns()
lsh_forest.index_data()
tf = perf_counter_ns()
print(f"Done in {(tf - t0) / 1e9} seconds")


Indexing 1000000 vectors of dim 4 with hash size of 8 bits across 4 hash tables...


Indexing table 1/4: 100%|██████████| 1000000/1000000 [00:02<00:00, 452757.62it/s]
Indexing table 2/4: 100%|██████████| 1000000/1000000 [00:02<00:00, 431617.60it/s]
Indexing table 3/4: 100%|██████████| 1000000/1000000 [00:02<00:00, 451288.09it/s]
Indexing table 4/4: 100%|██████████| 1000000/1000000 [00:02<00:00, 431440.41it/s]

Done in 9.069737947 seconds





Then we create an iterator that yields `(vec_id, dist)` pairs sorted by distance to the input vector (here `data[0]`), and time a full pass over it.

In [4]:
# Build the query iterator for the first indexed vector. Per the notebook text
# it yields results ordered by distance to the query; each item is unpacked
# below as a (vec_id, dist) pair.
query_iter = lsh_forest.query_iter(data[0])
# The iterator is created before the timer starts, so only its consumption
# is measured.
t0 = perf_counter_ns()
# idx and dist are not used; each iteration only looks up the yielded row in
# `data`, mirroring the direct-access loop this is compared against.
for idx, (vec_id, dist) in enumerate(query_iter):
    v = data[vec_id]
tf = perf_counter_ns()
# perf_counter_ns() returns nanoseconds; divide by 1e9 to report seconds.
# NOTE(review): the message assumes the iterator yields all num_vec vectors;
# the loop does not verify the count - confirm against LSHForest.query_iter.
print(f"Took {(tf - t0) / 1e9} seconds to iterate over all {num_vec} indexed vectors")

Took 2.717113106 seconds to iterate over all 1000000 indexed vectors


In [5]:
# Baseline measurement: fetch every row of `data` by index in plain order,
# without going through the LSH Forest, using the same timing pattern so the
# two durations can be compared.
t0 = perf_counter_ns()
for vec_id in range(num_vec):
    v = data[vec_id]
tf = perf_counter_ns()
# perf_counter_ns() returns nanoseconds; divide by 1e9 to report seconds.
print(f"Took {(tf - t0) / 1e9} seconds to iterate over all {num_vec} vectors directly")

Took 0.080383585 seconds to iterate over all 1000000 vectors directly
