In [22]:
from lsh_forest import LSHForest
import numpy as np
from time import perf_counter_ns


First, we index the data with LSH Forest

In [23]:
num_vec = 1_000_000
dim = 5
nbits = 4
num_hash_tables = 5
data = np.random.rand(num_vec, dim)

lsh_forest = LSHForest(nbits=nbits, dim=dim, num_hash_tables=num_hash_tables, distance_metric="cosine", data_ref=data)

print(f"Indexing {num_vec} vectors of dim {dim} with hash size of {nbits} bits across {num_hash_tables} hash tables...")
t0 = perf_counter_ns()
lsh_forest.index_data()
tf = perf_counter_ns()
print(f"Done in {(tf - t0) / 1e9} seconds")


Indexing 1000000 vectors of dim 5 with hash size of 4 bits across 5 hash tables...
Done in 14.389468159 seconds


Then we create an iterator that yields values sorted by the distance to the input vector

In [24]:
query_iter = lsh_forest.query_iter(data[0])
for idx, (vec_id, dist) in enumerate(query_iter):
    print(format(dist, ".4f"), vec_id)
    if idx == num_vec / 10:  # Just show a few results
        break


1.0000 0
0.9999 571064
0.9997 85745
0.9997 529960
0.9997 845256
0.9997 98744
0.9997 622746
0.9997 444922
0.9995 71567
0.9995 55563
0.9995 287658
0.9995 792663
0.9995 903479
0.9994 813224
0.9994 26366
0.9994 283516
0.9994 345382
0.9993 678198
0.9993 164528
0.9993 342981
0.9993 825689
0.9993 223405
0.9993 542046
0.9993 45758
0.9993 287976
0.9993 166762
0.9992 237778
0.9992 849446
0.9992 620113
0.9992 277315
0.9992 931167
0.9992 164913
0.9992 68980
0.9992 860667
0.9992 602268
0.9991 249110
0.9991 501152
0.9991 497600
0.9991 595023
0.9991 223376
0.9991 745856
0.9991 380324
0.9991 739943
0.9991 283164
0.9991 856440
0.9990 497413
0.9990 295997
0.9990 164575
0.9990 455043
0.9990 105632
0.9989 846262
0.9989 581934
0.9989 745449
0.9989 711961
0.9989 310749
0.9989 777491
0.9989 228284
0.9988 48735
0.9988 718721
0.9988 992605
0.9988 361527
0.9988 457107
0.9987 710566
0.9987 394263
0.9987 593316
0.9987 685665
0.9987 641860
0.9987 95271
0.9986 143834
0.9986 545956
0.9986 541639
0.9986 759766
0.9986