In [1]:
import numpy as np

# now define a function to read the fvecs file format of Sift1M dataset
def read_fvecs(fp):
    """Load an ``.fvecs`` file into a 2-D float32 array.

    The file is read as a flat stream of 32-bit words. The first word is
    taken as the vector dimension ``d``; the stream is then viewed as rows of
    ``d + 1`` words, the leading word of every row (the per-vector dimension
    header) is dropped, and the remaining words are reinterpreted as float32.

    Parameters
    ----------
    fp : str, path-like or open file
        Anything accepted by ``numpy.fromfile``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_vectors, d)`` with dtype float32. An empty file
        yields an empty array of shape ``(0, 0)``.

    Raises
    ------
    ValueError
        If the leading dimension is negative or the file does not contain a
        whole number of ``d + 1``-word records.
    """
    raw = np.fromfile(fp, dtype='int32')
    if raw.size == 0:
        # No header word to read; return an empty result instead of failing on raw[0].
        return np.empty((0, 0), dtype='float32')

    # Plain Python int so that ``dim + 1`` cannot wrap around as a 32-bit value.
    dim = int(raw[0])
    if dim < 0 or raw.size % (dim + 1) != 0:
        raise ValueError(
            f"malformed fvecs data: {raw.size} 32-bit words cannot be split "
            f"into records of {dim + 1} words (dimension header = {dim})"
        )

    # copy() makes the header-less slice contiguous so view() can reinterpret its bytes.
    return raw.reshape(-1, dim + 1)[:, 1:].copy().view('float32')

# Base vectors that will be indexed (the full base set holds 1M samples).
wb = read_fvecs('./sift/sift_base.fvecs')

# Load the query set, then keep only its first vector as a single-row 2-D batch.
xq = read_fvecs('./sift/sift_query.fvecs')
xq = xq[0][np.newaxis, :]

wb.shape, xq.shape

((1000000, 128), (1, 128))

In [3]:
# Keep only the first 500,000 base vectors; every index below is built on this subset.
wb = wb[:500000]
# Vector dimensionality, taken from the second axis of the base matrix.
d = wb.shape[1]
d

128

In [4]:
import faiss

# Composite Indexes
Composite indexes are combinations of different index components stacked on top of one another, which can produce either a beautifully optimized index or a reckless mess.
Most components in Faiss can be combined, but that does not mean they should be.

In Faiss this can be achieved easily using the index factory (`faiss.index_factory`).

A composite index is built from any combination of:
- ```Vector transform``` — a pre-processing step applied to vectors before indexing (PCA, OPQ).
- ```Coarse quantizer``` — rough organization of vectors to sub-domains (for restricting search scope, includes IVF, IMI, and HNSW).
- ```Fine quantizer``` — a finer compression of vectors into smaller domains (for compressing index size, such as PQ).
- ```Refinement``` — a final step at search-time which re-orders results using distance calculations on the original flat vectors. Alternatively, another (non-flat) index can be used.

### The class-based method we have used until now


In [5]:

# Class-based construction: an IVF index storing full vectors, with a flat L2 coarse quantizer.
d = wb.shape[1]  # vector dimensionality
nlist = 256      # number of IVF cells; matches the "IVF256" factory string used below

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

# The index is trained on the base vectors before any vectors are added.
index.train(wb)

In [6]:
# Add the base vectors to the index trained in the previous cell.
index.add(wb)

### Index Factory Method

In [7]:
# Build an index for d-dimensional vectors from the factory description string "IVF256,Flat".
index_f = faiss.index_factory(d, "IVF256,Flat")

In [8]:
# Train and populate the factory-built index with the same vectors given to `index`.
index_f.train(wb)
index_f.add(wb)


In [9]:
# Number of nearest neighbours to retrieve for the query.
k = 10
# search() returns two arrays; I (displayed below) holds the ids of the returned neighbours.
D, I = index.search(xq, k)  
I

array([[455537, 236647, 454263,  68299, 478814, 180955,  59844, 394507,
        486457, 237161]], dtype=int64)

In [10]:
# Same query against the factory-built index; results are kept separately as D_f / I_f.
k = 10
D_f, I_f = index_f.search(xq, k)  
I_f

array([[455537, 236647, 454263,  68299, 478814, 180955,  59844, 394507,
        486457, 237161]], dtype=int64)

In [11]:
# Sanity check: the class-built and factory-built indexes should return identical neighbour ids.
# Nothing is printed when the two id arrays differ.
if np.array_equal(I, I_f):
    print("Both indexes give the same results")

Both indexes give the same results


#### With that in mind — why should we care to learn how to use index_factory?

First, it can depend on personal preference. If you prefer the class-based index building approach, stick with it.

However, by using the index_factory we can greatly improve the elegance and clarity of our code.

#### Let's take a look at a more complex (composite) index

In [None]:
# Parameters shared by the stages of the composite index.
# They correspond to the factory string "OPQ32,IVF256,PQ32x8,RefineFlat" used further below.
d = wb.shape[1]   # vector dimensionality (the OPQ stage here keeps the same dimensionality)
nlist = 256       # IVF cells
m = 32            # value passed to both OPQMatrix and IndexIVFPQ
nbits = 8         # last argument of IndexIVFPQ

# Coarse + fine quantization: IVF over a flat L2 quantizer, with PQ-encoded vectors.
vecs = faiss.IndexFlatL2(d)
sub_index = faiss.IndexIVFPQ(vecs, d, nlist, m, nbits)

# Pre-processing transform applied to vectors before they reach the IVF-PQ index.
opq = faiss.OPQMatrix(d, m)

# Chain the transform in front of the IVF-PQ index ...
q = faiss.IndexPreTransform(opq, sub_index)

# ... and wrap the whole pipeline in a flat refinement stage.
# Note: this rebinds `index`, replacing the IVF-Flat index built earlier.
index = faiss.IndexRefineFlat(q)

# Train the full pipeline, then add the base vectors.
index.train(wb)
index.add(wb)

In [16]:
# Query the class-built composite index for 5 neighbours; D and I are reused in the comparison below.
D, I = index.search(xq, k = 5)

In [None]:

# The same composite pipeline expressed as a single factory string.
# NOTE(review): this rebinds `index`, so the class-built composite from the previous cells is
# no longer reachable by name, while `index_f` still refers to the earlier "IVF256,Flat" index.
index = faiss.index_factory(d, "OPQ32,IVF256,PQ32x8,RefineFlat")
index.train(wb)
index.add(wb)

In [17]:
# `index` is now the factory-built composite, so D_f / I_f hold the factory index's results.
D_f, I_f = index.search(xq, k = 5)

In [18]:
# Show the ids from the class-built (I) and factory-built (I_f) composite indexes, then
# confirm they match exactly. Nothing extra is printed if they differ.
print(I)
print(I_f)
if np.array_equal(I, I_f):
    print("Both indexes give the same results")

[[455537 236647 454263  68299  91348]]
[[455537 236647 454263  68299  91348]]
Both indexes give the same results


In [21]:

%%timeit
# Times a 5-neighbour search on `index`, which at this point is the factory-built composite index.
index.search(xq, k=5)

258 μs ± 5.92 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [22]:
%%timeit
# NOTE(review): `index_f` is still the "IVF256,Flat" index built earlier, not a composite index,
# so this timing and the previous one compare two different index types.
index_f.search(xq, k=5)

37.9 μs ± 1.46 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [24]:
import os
def get_memory_size(index):
    """Return the size in bytes of ``index`` once serialized by ``faiss.write_index``.

    The index is written to a uniquely named temporary file, the file size is
    read back, and the file is removed again. A unique temp file is used so
    that an existing ``temp.index`` in the working directory is never
    overwritten or deleted, and the ``finally`` clause guarantees clean-up
    even when writing or measuring fails.

    Parameters
    ----------
    index
        Object accepted by ``faiss.write_index``.

    Returns
    -------
    int
        Size of the serialized index file in bytes.
    """
    import tempfile

    fd, tmp_path = tempfile.mkstemp(suffix=".index")
    # Only the path is needed; close our handle so the writer can open the file itself.
    os.close(fd)
    try:
        faiss.write_index(index, tmp_path)
        return os.path.getsize(tmp_path)
    finally:
        os.remove(tmp_path)

# Serialized size of each index.
# NOTE(review): here `index` is the factory-built composite and `index_f` is the earlier
# "IVF256,Flat" index, so the two numbers describe different index types.
index_size = get_memory_size(index)
index_f_size = get_memory_size(index_f)

print(f"Index size: {index_size} bytes")
print(f"Index_f size: {index_f_size} bytes")

Index size: 276330065 bytes
Index_f size: 260133259 bytes
