In [1]:
from tqdm import tqdm
from bitstring import BitArray
import numpy as np
from collections import *

In [209]:
def fibonacci_hash_float(value: float, scale=None):
    """Map a float to an integer using a golden-ratio (Fibonacci) multiplier.

    The float's 64-bit pattern has its three highest bits XOR-ed into its
    three lowest bits; the result is read back as a float, multiplied by
    ``int(2**64 / phi)`` and by ``scale``, and truncated to ``int``.

    Parameters
    ----------
    value : float
        Number to hash.
    scale : float, optional
        Extra multiplier applied before truncation.  When omitted (the
        default) a fresh ``np.random.random_sample()`` draw is used, so two
        calls with the same ``value`` generally return different integers.
        Pass an explicit ``scale`` to obtain a repeatable hash.

    Returns
    -------
    int
        The truncated product.

    Raises
    ------
    ValueError, OverflowError
        Raised by ``int()`` when the product is NaN or infinite
        (for example when ``value`` itself is NaN).
    """
    golden_ratio = (1 + 5 ** 0.5) / 2
    multiplier = int(2 ** 64 / golden_ratio)

    bits = BitArray(float=value, length=64)
    # Shifting right by 61 leaves only the top 3 bits, now in the lowest
    # 3 positions; XOR folds them into the low end of the pattern.
    bits ^= bits >> 61

    if scale is None:
        # Default behaviour: a new random factor on every call.
        scale = np.random.random_sample()

    return int(multiplier * bits.float * scale)

In [116]:
def minhash(freqs, times, threshold = None):
    """Build a min-hash style signature from paired frequency/time values.

    For each of ``threshold`` rounds, every ``(freq, time)`` pair is hashed as
    ``fibonacci_hash_float(freq) ^ fibonacci_hash_float(time)`` and the
    smallest hash of the round is appended to the signature.

    Parameters
    ----------
    freqs, times : iterable of float
        Paired values; pairing stops at the shorter of the two (``zip``).
        They are materialised once, so one-shot iterators are supported.
    threshold : int
        Number of rounds, i.e. the length of the returned signature.
        Required despite the ``None`` default kept for signature
        compatibility.

    Returns
    -------
    list of int
        One minimum per round, as exact Python integers.

    Raises
    ------
    TypeError
        If ``threshold`` is not supplied.
    ValueError
        If ``threshold`` is positive but there are no pairs to hash.
    """
    if threshold is None:
        raise TypeError(
            "minhash() requires an integer 'threshold' (number of hash rounds)"
        )

    pairs = list(zip(freqs, times))
    signature = []

    for _ in range(threshold):
        # Built-in min keeps arbitrary-precision ints exact; converting the
        # hashes to a NumPy array first can coerce very large or mixed-sign
        # values to float64/object dtype.
        signature.append(
            min(
                fibonacci_hash_float(fr) ^ fibonacci_hash_float(tm)
                for fr, tm in pairs
            )
        )

    return signature

In [27]:
class HashTable:
    """Single random-projection hash table mapping vectors to lists of labels.

    A vector is hashed to a string of ``hash_size`` characters, one per row of
    ``projections``: ``'1'`` where the dot product with that row is positive,
    ``'0'`` otherwise.  Vectors with the same string share a bucket.
    """

    def __init__(self, hash_size, inp_dimensions):
        """Create an empty table with freshly drawn random projections.

        hash_size: number of projection rows (characters in each hash key).
        inp_dimensions: length of the vectors that will be hashed.
        """
        self.hash_size = hash_size
        self.inp_dimensions = inp_dimensions
        # Bucket key (bit string) -> labels in insertion order.
        self.hash_table = dict()
        # One standard-normal row per hash bit, drawn from NumPy's global RNG.
        self.projections = np.random.randn(self.hash_size, inp_dimensions)

    def generate_hash(self, inp_vector):
        """Return the '0'/'1' bucket key for a 1-D ``inp_vector``."""
        bools = (np.dot(inp_vector, self.projections.T) > 0).astype('int')
        return ''.join(bools.astype('str'))

    def setitem(self, inp_vec, label):
        """Append ``label`` to the bucket that ``inp_vec`` hashes to."""
        hash_value = self.generate_hash(inp_vec)
        # Append in place instead of rebuilding the bucket list per insert.
        self.hash_table.setdefault(hash_value, []).append(label)

    def getitem(self, inp_vec):
        """Return the labels stored in ``inp_vec``'s bucket ([] if none).

        A new list is returned each time, so callers may modify the result
        without altering the table.
        """
        hash_value = self.generate_hash(inp_vec)
        return list(self.hash_table.get(hash_value, ()))
        
# Demo table: 4-character hash keys over 20-dimensional input vectors.
hash_table = HashTable(hash_size=4, inp_dimensions=20)

In [4]:
# Preview of the input used in the cells below: 20 values drawn uniformly from [0, 1).
np.random.random_sample(20)

array([0.94986562, 0.92071818, 0.08748267, 0.46750966, 0.07163665,
       0.64865436, 0.23848227, 0.34934048, 0.13342551, 0.09856092,
       0.51274838, 0.30904344, 0.01825954, 0.79880908, 0.09458141,
       0.63409216, 0.09697352, 0.0934845 , 0.22691578, 0.74800618])

In [23]:
# Hash one random 20-d vector; with hash_size=4 the key is a 4-character string of '0'/'1'.
hash_table.generate_hash(np.random.random_sample(20))

'1011'

In [30]:
# Store the label 1 in whichever bucket a fresh random vector hashes to.
hash_table.setitem(np.random.random_sample(20), 1)

In [60]:
# Look up a different random vector; the result is [] when its bucket holds no labels.
hash_table.getitem(np.random.random_sample(20))

[]

In [61]:
class LSH:
    """Several independently randomised ``HashTable`` objects queried together.

    Every insert goes to all tables; a lookup returns the union of the
    labels found in the matching bucket of each table.
    """

    def __init__(self, num_tables, hash_size, inp_dimensions):
        """Create ``num_tables`` tables, each built as
        ``HashTable(hash_size, inp_dimensions)``."""
        self.num_tables = num_tables
        self.hash_size = hash_size
        self.inp_dimensions = inp_dimensions
        self.hash_tables = [
            HashTable(self.hash_size, self.inp_dimensions)
            for _ in range(self.num_tables)
        ]

    def setitem(self, inp_vec, label):
        """Insert ``label`` for ``inp_vec`` into every table."""
        for table in self.hash_tables:
            table.setitem(inp_vec, label)

    def getitem(self, inp_vec):
        """Return the distinct labels sharing a bucket with ``inp_vec`` in any table.

        Labels appear once each, in the order they are first encountered
        while scanning the tables in sequence, so the result is the same
        from run to run.  Labels must be hashable.
        """
        results = []
        for table in self.hash_tables:
            results.extend(table.getitem(inp_vec))
        # dict keys keep insertion order, unlike a set.
        return list(dict.fromkeys(results))

In [64]:
# 20 hash tables, 6-character hash keys, 20-dimensional input vectors.
lsh = LSH(20, 6, 20)

In [73]:
# Insert one random vector under the label "test1" into every table.
lsh.setitem(np.random.random_sample(20), "test1")

In [74]:
# Query with a new random vector: returns the labels found in its bucket in any table.
# NOTE(review): the saved output also lists "test2"/"test3", which no visible cell
# inserts — presumably the insert cell was re-run with other labels; confirm with
# Restart Kernel -> Run All.
lsh.getitem(np.random.random_sample(20))

['test2', 'test3', 'test1']