# NearPy
---
## Install Dependencies

In [None]:
%pip install numpy h5py requests NearPy

In [None]:
# import libraries
import os
import tempfile
import time

import h5py
import nearpy
import numpy as np
import requests
from tqdm import tqdm

## Download Dataset
---

In [None]:
# Fetch the SIFT benchmark file once and open it read-only.
url = "http://ann-benchmarks.com/sift-128-euclidean.hdf5"
loc = "sift.hdf5"

# Skip the download when a previous run already produced the file.
if not os.path.exists(loc):
    # Write to a side file first so an interrupted transfer never leaves a
    # truncated file under the final name.
    partial = loc + ".part"
    # stream=True avoids holding the whole payload in memory at once.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as HDF5.
        response.raise_for_status()
        with open(partial, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial, loc)

# Left open on purpose: later cells read datasets from this handle.
sift_h5py = h5py.File(loc, "r")

In [None]:
# Names of the entries stored at the top level of the HDF5 file
list(sift_h5py)

In [None]:
# Vectors to index ('train') and the first 100 'test' vectors used as queries
dataset = sift_h5py['train']
queries = sift_h5py['test'][:100]
print(dataset.shape, queries.shape, sep="\n")

In [None]:
# Print the shape of each entry, one per line
for key in ("neighbors", "train", "test"):
    print(sift_h5py.get(key).shape)

## Building a NearPy Engine
---

In [None]:
# Dimension of our vector space, taken from the training data so it always
# matches the vectors that get indexed (128 for this SIFT file)
dimension = dataset.shape[1]

In [None]:
# Scale every row of the dataset by its L2 norm.
# NOTE(review): `normalized_dataset` is not referenced by any later cell
# visible in this notebook; the engine is fed the raw `dataset` vectors.
row_norms = np.linalg.norm(dataset, axis=1, keepdims=True)
normalized_dataset = dataset / row_norms

# Random binary projection hash using 10 bits
rbp = nearpy.hashes.RandomBinaryProjections('rbp', 10)

In [None]:
# NearPy engine over `dimension`-sized vectors, using the `rbp` hash and
# Euclidean distance
engine = nearpy.Engine(
    dimension,
    lshashes=[rbp],
    distance=nearpy.distances.EuclideanDistance(),
)

In [None]:
# Store base vectors in engine, keyed by their row index.
# `enumerate` has no length, so pass the total explicitly to get a real
# progress bar (percentage and ETA) instead of a bare counter.
for idx, vec in tqdm(enumerate(dataset), total=len(dataset)):
    engine.store_vector(vec, idx)

In [None]:
# Perform similarity search on each query vector and time the whole batch.
# perf_counter is a monotonic, high-resolution clock, so the measurement is
# not affected by system clock adjustments.
start = time.perf_counter()
nearest_neighbors = [engine.neighbours(query) for query in queries]
end = time.perf_counter()

print("Time: ", end - start)

In [None]:
# Parameter comparison: hash bit length variation.
# For every bit length from 10 to 20, build a fresh engine, index the whole
# dataset, and record how long the query batch takes in `times`.
bitlengths = range(10, 21)
times = []
for bitlength in bitlengths:
    print(f"Hash bucket length: {bitlength} bits")
    rbp = nearpy.hashes.RandomBinaryProjections('rbp', bitlength)
    engine = nearpy.Engine(
        dimension,
        lshashes=[rbp],
        distance=nearpy.distances.EuclideanDistance(),
    )
    # Pass the total explicitly: `enumerate` hides the dataset length from tqdm.
    for idx, vec in tqdm(enumerate(dataset), total=len(dataset)):
        engine.store_vector(vec, idx)

    # Only the query phase is timed; indexing above is excluded.
    start = time.perf_counter()
    nearest_neighbors = [engine.neighbours(query) for query in queries]
    end = time.perf_counter()

    elapsed = end - start
    print("Time: ", elapsed)
    times.append(elapsed)