In [1]:
import faiss

In [2]:
import numpy as np 

In [3]:
rng = np.random.default_rng(100)
vectors = rng.random((10**5, 70), dtype=np.float32)

vectors.shape

(100000, 70)

In [4]:
data = vectors.copy()

In [10]:
rng = np.random.default_rng(100)
Xtrain = rng.random((5000, 70), dtype=np.float32) 

In [7]:
rng_query = np.random.default_rng(100)
query = rng_query.random((1, 70), dtype=np.float32)

In [29]:
# cosine similarity
np.argsort(vectors.dot(query.T).T / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(query)), axis= 1).squeeze().tolist()[::-1][:5]

[89273, 70829, 92082, 67825, 76497]

## Trial 1 : Using IndexFlatL2 as coarse quantizer and IVFPQ 

In [17]:
quantizer = faiss.IndexFlatL2(data.shape[1])
m = [7, 10 , 14] #=> reduceDimension  
nlists = [256 , 512 , 1024 , 2048 , 4096] #number of clusters
nbits = [8 , 11] #number of bits per vector (representative centroids count )

for i in m:
    for j in nlists:
        for k in nbits:
            index = faiss.IndexIVFPQ(quantizer,data.shape[1],j,i,k)
            index.train(Xtrain)
            index.add(data)
            index.nprobe = 10 
            D, I = index.search(query, 5)
            print("m = ",i,"nlists = " ,j , "nbits = " , k,"D = ",D,"I = ",I)
        # index.reset()
# index = faiss.IndexIVFPQ(quantizer,data.shape[1],256,7,8) #=> Quantizer , Dimension , clusters , reduceDimension , nbits

m =  7 nlists =  256 nbits =  8 D =  [[5.6489396 5.669654  5.6715555 5.801227  5.8683796]] I =  [[  333 67825 70829 92502 88749]]
m =  7 nlists =  256 nbits =  11 D =  [[5.6473274 5.792086  6.2040205 6.315798  6.3201523]] I =  [[95806 47841 74406 78122 67825]]
m =  7 nlists =  512 nbits =  8 D =  [[4.382058  4.726253  5.161772  5.269694  5.5039954]] I =  [[69332 69743 82419 67883 59467]]
m =  7 nlists =  512 nbits =  11 D =  [[5.162032  5.615609  5.655964  5.774118  5.8294487]] I =  [[69332 11590 71309 45729 52689]]
m =  7 nlists =  1024 nbits =  8 D =  [[4.1817346 4.343134  4.6996837 4.736382  4.893241 ]] I =  [[37014 67883 86578 73084 67825]]
m =  7 nlists =  1024 nbits =  11 D =  [[4.5497737 5.0774665 5.1846905 5.1952014 5.2746954]] I =  [[73084 67825 67883 95297 89435]]
m =  7 nlists =  2048 nbits =  8 D =  [[4.215405  4.320094  4.4152    4.6056566 4.609274 ]] I =  [[80268 23798 99141 61164 56683]]
m =  7 nlists =  2048 nbits =  11 D =  [[4.6056376 5.0779853 5.1461983 5.369644  5.4

In [16]:
%%timeit

dist_x , id_x = index.search(query,5)

226 µs ± 7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


- Comments: 
    1. The results are fair good 
    2. Speed is OK 
    3. Memory is OK

## Trial 2 : Using hnsw as a quantizer with IVFPQ

In [33]:
M = [22 ,25, 30]
nlists = [1024 , 2048 , 4096]
m = [14,35]

In [34]:
for i in M:
    quantizer = faiss.IndexHNSWFlat(data.shape[1], i)
    for k in nlists:
        for l in m:
            index = faiss.IndexIVFPQ(quantizer,data.shape[1],k,l,8)
            index.train(Xtrain)
            index.add(data)
            index.nprobe = 10
            D, I = index.search(query, 5)
            print("M = ",i , "nlists = " , k,"D = ",D,"I = ",I)

M =  22 nlists =  1024 D =  [[5.84519   5.9200983 5.978896  6.06269   6.270578 ]] I =  [[34003 89273 47350 46526 95192]]
M =  22 nlists =  1024 D =  [[6.0940623 6.4300823 6.516925  6.60669   6.6402855]] I =  [[89273 95806 76497 53097 47350]]
M =  22 nlists =  2048 D =  [[5.9816723 6.003456  6.3272185 6.3451176 6.3617845]] I =  [[56748 52027 34695 70829 53097]]
M =  22 nlists =  2048 D =  [[6.0062895 6.3869057 6.6166353 6.6439734 6.9253607]] I =  [[70829 52027 92082 53097 61812]]
M =  22 nlists =  4096 D =  [[5.619247  6.14412   6.1954055 6.404251  6.718397 ]] I =  [[67883 68130 52027 10574 74978]]
M =  22 nlists =  4096 D =  [[6.4599113 6.56174   6.596447  6.605397  6.707086 ]] I =  [[67883 92082 10574 91589 52027]]
M =  25 nlists =  1024 D =  [[5.212241 5.470519 5.486867 5.599713 5.640121]] I =  [[52027 69332 67883 20036 21470]]
M =  25 nlists =  1024 D =  [[6.1882477 6.3203797 6.3327575 6.3700933 6.4284353]] I =  [[52689 52027 67883 61641 95806]]
M =  25 nlists =  2048 D =  [[5.41528

In [43]:
%%timeit 

dists , ids = index.search(query,10)

234 µs ± 5.75 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


- Comments
    1. The results are Not good at all  
    2. Speed is slow 
    3. Memory is huge

## PQ

In [35]:
import nanopq

In [40]:
M = [7 , 10 , 14]

for i in M:
    pq = nanopq.PQ(M=i, verbose=False)
    pq.fit(Xtrain)
    X_code = pq.encode(data.astype(np.float32))
    dists = pq.dtable(query[0]).adist(X_code)
    print("M = ",i,"dists = ",dists)

M =  7 dists =  [10.137394 13.16754  10.689751 ... 11.191105  9.19523   8.612617]
M =  10 dists =  [12.007293 13.475139 12.842962 ... 10.204383  8.628861  9.543551]
M =  14 dists =  [11.440275 13.278863 12.229638 ... 10.683782  9.452681 10.140962]


- Comments:
    1. The results are Not good at all  
    2. Speed is ok
    3. Memory is very very good

## IVF_Flat

In [42]:
# Generating random data (1M vectors with dimension 70)
quantizer = faiss.IndexFlatL2(data.shape[1])
m = [7, 10 , 14, 35 ,70]
nlists = [256 , 512 , 1024 , 2048 , 4096]
for i in m:
    for j in nlists:
        index = faiss.IndexIVFFlat(quantizer,data.shape[1],j,faiss.METRIC_L2)
        index.train(Xtrain)
        index.add(data)
        index.nprobe = 5
        D, I = index.search(query, 5)
        print("m = ",i,"nlists = " ,j , "D = ",D,"I = ",I)
        # index.reset()
# index = faiss.IndexIVFPQ(quantizer,data.shape[1],256,7,8) #=> Quantizer , Dimension , clusters , reduceDimension , nbits


m =  7 nlists =  256 D =  [[6.290603  6.377116  6.56882   6.579968  6.8486156]] I =  [[67825 95806 10574 40420 83624]]
m =  7 nlists =  512 D =  [[6.317532  6.443162  6.7743034 6.8572416 6.97613  ]] I =  [[52689 67883 69332 79455 69743]]
m =  7 nlists =  1024 D =  [[6.290603  6.443162  6.4945564 6.56882   6.5782757]] I =  [[67825 67883 92082 10574 52027]]
m =  7 nlists =  2048 D =  [[6.5554543 6.87107   6.9177866 7.2246375 7.451375 ]] I =  [[99141 14373  9198 60163 25545]]
m =  7 nlists =  4096 D =  [[6.0848384 6.377116  6.419765  6.443162  6.5554543]] I =  [[89273 95806  6419 67883 99141]]
m =  10 nlists =  256 D =  [[6.290603  6.377116  6.56882   6.579968  6.8486156]] I =  [[67825 95806 10574 40420 83624]]
m =  10 nlists =  512 D =  [[6.317532  6.443162  6.7743034 6.8572416 6.97613  ]] I =  [[52689 67883 69332 79455 69743]]
m =  10 nlists =  1024 D =  [[6.290603  6.443162  6.4945564 6.56882   6.5782757]] I =  [[67825 67883 92082 10574 52027]]
m =  10 nlists =  2048 D =  [[6.5554543 6

- Comments:
    1. The results are not good (moderate)  --> can be tuned for better results
    2. Speed is ok
    3. Memory is ok
    4. Ease of construction 

## LSH

In [45]:
#let's try LSH from faiss
index = faiss.IndexLSH(data.shape[1], 70)
index.train(Xtrain)
index.add(data)
index.nprobe = 5
D, I = index.search(query, 5)
print("D = ",D,"I = ",I)

D =  [[6. 6. 6. 7. 7.]] I =  [[56430 64727 77647  1976  2172]]
