Before we start, we need to get data. We will be using the Sift1M dataset. It can be downloaded and opened using this script:

In [1]:
import shutil
import urllib.request as request
from contextlib import closing

# first we download the Sift1M dataset
with closing(request.urlopen('ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz')) as r:
    with open('sift.tar.gz', 'wb') as f:
        shutil.copyfileobj(r, f)

In [2]:
import tarfile

# the download leaves us with a tar.gz file, we unzip it
tar = tarfile.open('sift.tar.gz', "r:gz")
tar.extractall()

  tar.extractall()


In [1]:
import numpy as np

# now define a function to read the fvecs file format of Sift1M dataset
def read_fvecs(fp):
    a = np.fromfile(fp, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy().view('float32')

In [2]:
# data we will search through
wb = read_fvecs('./sift/sift_base.fvecs')  # 1M samples
# also get some query vectors to search with
xq = read_fvecs('./sift/sift_query.fvecs')
# take just one query (there are many in sift_learn.fvecs)
xq = xq[0].reshape(1, xq.shape[1])

In [5]:
wb.shape, xq.shape

((1000000, 128), (1, 128))

This is a 1 MILLION Sized Dataset, with vector dimension 128

So This will be very useful for learning on how FAISS deals with million level data

In [18]:
# Dimension D
D = wb.shape[1]
D


128

In [8]:
m = 8
assert D % m == 0, "D must be divisible by m"
# nbit value 11 is recommended for IVF-PQ in most of the Research papers
nbits = 8 # k = 2^nbits total number of centroids we have
k_ = 2 ** nbits // m  # number of centroids per sub-quantizer
k_


32

In [30]:
import faiss

index = faiss.IndexPQ(D, m, nbits)
index.is_trained


False

In [31]:
index.train(wb)

In [32]:
index.add(wb)

In [33]:
%%time
distance, I = index.search(xq, k = 100)

CPU times: total: 109 ms
Wall time: 15.8 ms


In [34]:
I.shape, distance.shape

((1, 100), (1, 100))

In [35]:
%%timeit
index.search(xq, k = 100)

4.41 ms ± 151 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Lets Compare it with IndexFlatL2

In [36]:
l2_index = faiss.IndexFlatL2(D)

In [37]:
l2_index.add(wb)

In [38]:

l2_distance, l2_I = l2_index.search(xq, k = 100)

In [40]:
sum([1 for i in I[0] if i in l2_I])

38

Now Let See the memore usage of those indexes for one miion data set

In [None]:
import os
def get_memory_size(index):
    faiss.write_index(index, "temp.index")
    file_size  = os.path.getsize("temp.index")
    os.remove("temp.index")
    return file_size

index_size, l2_index_size =get_memory_size(index), get_memory_size(l2_index)

In [44]:
index_size, l2_index_size

(8131158, 512000045)

In [46]:
# memory reduction percent
(l2_index_size - index_size ) /l2_index_size * 100

98.41188334270556

All of this is just PQ, now lets the same with IVFPQ

In [47]:
nlist = 256  # number of clusters
quantizer = faiss.IndexFlatL2(D)  # the other index
index = faiss.IndexIVFPQ(quantizer, D, nlist, m, nbits)
index.is_trained

False

In [48]:
index.train(wb)

In [49]:
index.add(wb)

In [50]:
Distance, I = index.search(xq, k = 100)

In [51]:
%%timeit
index.search(xq, k = 100)

43.1 μs ± 244 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [57]:
# Lets Compare the recall and memorey reduction with L2 index
sum([1 for i in I[0] if i in l2_I])

34

In [58]:
pq_index_size = get_memory_size(index)
pq_index_size, l2_index_size

(16264372, 512000045)

In [59]:
(l2_index_size - pq_index_size ) /l2_index_size * 100

96.82336512294643

In [55]:
# The accuracy is pretty bad..

In [56]:
# Now we increase the n probe value to make it search more clusters
index.nprobe = 16  # default nprobe is 1

Distance, I = index.search(xq, k = 100)

In [60]:
sum([1 for i in I[0] if i in l2_I])

34

In [62]:
# Now we increase the n probe value to make it search more clusters
index.nprobe = 25  # default nprobe is 1

Distance, I = index.search(xq, k = 100)
sum([1 for i in I[0] if i in l2_I])

# THE max result is not changing even if increase the nprobe value further.

34