In [1]:
import numpy as np
from tqdm import tqdm
import timeit
np.random.seed(1)

In [2]:
#
# LSH used to build LSH hash table
#

class LSH:
    def __init__(self, input_dim=3, hash_dim=1):
        self.planes = []
        for i in range(hash_dim):
            v = np.random.rand(input_dim)
            v_hat = v / np.linalg.norm(v)
            self.planes.append(v_hat)
    
        self.planes = np.matrix(self.planes)
        self.buckets = dict()
    
    # Returns LSH of a vector
    def hash(self, vector):
        hash_vector = np.where((self.planes @ vector) < 0, 1, 0)[0]
        
        hash_string = "".join([str(num) for num in hash_vector])
        return hash_string
    
    # Add vector to bucket
    def add(self, vector):
        hashed = self.hash(vector)
        
        if hashed in self.buckets:
            self.buckets[hashed].append(vector)
        else:
            self.buckets[hashed] = [vector]
    
    # Returns bucket vector is in
    def get(self, vector):
        hashed = self.hash(vector)
        
        if hashed in self.buckets:
            return self.buckets[hashed]
        
        return []
        

In [43]:
#
# Nearest Neighbour operations
#

class NN():
    
    # Returns Euclidean distance between vectors
    def _distance_(self, v1, v2):
        return np.linalg.norm(v1-v2)

    def _matrix_distance_(self, v1, matrix):
        v1 = np.array([v1])

        p1 = np.sum(v1**2, axis=1)[:, np.newaxis]
        p2 = np.sum(matrix**2, axis=1)
        p3 =  -2 * np.dot(v1, matrix.T)

        return p1 + p2 + p3
    
    # Returns v1's Nearest Neighbour in vectors
    def get_nn(self, v1, vectors):
        min_dist = float("inf")
        min_vec = None
        
        for v2 in vectors:
            dist = self._distance_(v1, v2)
            if dist < min_dist:
                min_dist = dist
                min_vec = v2
        
        return min_vec

    def get_k_nn(self, v1, vectors, k=10):
        dists = nn._matrix_distance_(v1, vectors)
        top_k = np.argsort(dists, axis=1)[0,:k]
            
#         print(len(top_k))
#         print(top_k[0])
#         print(vectors.shape)
        
        return vectors[top_k[:],:]
        

In [4]:
input_dim = 726
lsh = LSH(input_dim=input_dim, hash_dim=6)

In [45]:
nn = NN()

In [6]:
vectors = []

for i in tqdm(range(100000)):
    v = np.random.uniform(-1,1, [input_dim])
    lsh.add(v)
    vectors.append(v)

100%|██████████| 100000/100000 [00:06<00:00, 15959.66it/s]


In [48]:
#v1  = np.random.uniform(-1,1, [input_dim])

bucket_vectors = lsh.get(v1)

print("Full vector list size: " + str(len(vectors)))
print("Bucket Size: " + str(len(bucket_vectors)) + " ("+ str(100 * len(bucket_vectors)/len(vectors)) +"%)")

# Find Nearest Neighbour in entire dataset w/ execution time
starttime = timeit.default_timer()
nn1 = nn.get_nn(v1, vectors)
print("Actual Nearest Neighbour: "  + ", time: " + str(timeit.default_timer() - starttime))

# Find Nearest Neighbour in LSH hash bucket w/ execution time
starttime = timeit.default_timer()
nn2 = nn.get_nn(v1, bucket_vectors)
print("Bucket Nearest Neighbour: " + ", time: " + str(timeit.default_timer() - starttime))

# Find Nearest Neighbour in LSH hash bucket w/ execution time
matrix = np.array(bucket_vectors)
starttime = timeit.default_timer()
nn3 = nn.get_k_nn(v1, matrix, 1)
print("Matrix Bucket Nearest Neighbour: " + ", time: " + str(timeit.default_timer() - starttime))

print(nn._distance_(nn3,nn1))

Full vector list size: 100000
Bucket Size: 24621 (24.621%)
Actual Nearest Neighbour: , time: 0.9761296160004349
Bucket Nearest Neighbour: , time: 0.24653854200005298
Matrix Bucket Nearest Neighbour: , time: 0.058122972999626654
21.073012388401608


In [8]:
print(nn._distance_(nn1,nn2))
print(nn._distance_(v1,nn2))
print(nn._distance_(nn1,v1))

21.785424418271397
20.300550404032982
19.619099894158204


In [9]:
np.array(vectors).shape

(100000, 726)

In [42]:
### Testing the matrix euclidean distance


v = np.array(v1)
matrix = np.array(vectors)

starttime = timeit.default_timer()
dists = nn._matrix_distance_(v, matrix)
top_3 = np.argsort(dists, axis=1)[0,:1]
#print(top_3[0])
print(dists[:,top_3[:]])
print("Matrix distance time: " + str(timeit.default_timer() - starttime))


starttime = timeit.default_timer()
min_dist = float("inf")
min_vec = None
for v2 in vectors:
    dist = nn._distance_(v1, v2)
    if dist < min_dist:
        min_dist = dist
        min_vec = v2

print(min_dist ** 2)
print("Original distance time: " + str(timeit.default_timer() - starttime))

[[398.79042678]]
Matrix distance time: 0.24450569600048766
398.79042678111676
Original distance time: 1.089641475999997
