In [1]:
import numpy as np
import faiss
import sys
from matplotlib import pyplot

ModuleNotFoundError: No module named 'faiss'

In [None]:
# Create an (untrained) IVF-PQ index: a flat L2 coarse quantizer plus
# product-quantized codes stored in the inverted lists.
nlist = 1024  # number of inverted lists handed to IndexIVFPQ
m = 8  # number of PQ sub-quantizers handed to IndexIVFPQ
kbits = 8 # bits per sub-quantizer code: 2^5 = 32, 2^8 = 256 centroids
d = 64  # dimensionality of the indexed vectors
coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, nlist, m, kbits)

In [None]:
# Fill the index: generate 10000 random d-dimensional vectors (seed 1234),
# train the index on them, then add them.
xb = faiss.rand((10000, d), 1234)
index.train(xb)
index.add(xb)

In [None]:
# List the attributes exposed by the index's product quantizer.
dir(index.pq)

In [None]:
# Print the scalar attributes of the product quantizer.
print("index.pq.d", index.pq.d)
print("index.pq.dsub", index.pq.dsub)
print("index.pq.ksub", index.pq.ksub)
print("index.pq.nbits", index.pq.nbits)
print("index.pq.M", index.pq.M)
print("index.pq.code_size", index.pq.code_size)
# Other attributes (centroids, sdc_table, cp, decode, this, train, train_type)
# are deliberately not printed here.

In [None]:
def get_centroids(index):
    """Return the PQ centroids of `index` as a numpy array.

    The flat centroid vector of `index.pq` is converted to numpy and reshaped
    to `(pq.M, pq.ksub, pq.dsub)`.
    """
    quantizer = index.pq
    flat_centroids = faiss.vector_to_array(quantizer.centroids)
    return flat_centroids.reshape(quantizer.M, quantizer.ksub, quantizer.dsub)

In [None]:
# Get the PQ centroids; `pq_cen` is a second name bound to the same array.
cen = get_centroids(index)
pq_cen = cen
pq_cen.shape

In [None]:
# Grab the inverted lists of the IVF index.
# If there is a pre-transform, you can also use
# invlists = faiss.extract_index_ivf(index).invlists
invlists = index.invlists

In [None]:
# List the attributes exposed by the inverted-lists object.
dir(invlists)

In [None]:
# Histogram of inverted-list sizes: bc[s] counts the lists holding exactly s vectors.
list_sizes = [invlists.list_size(list_no) for list_no in range(invlists.nlist)]
bc = np.bincount(list_sizes)
pyplot.step(np.arange(bc.size), bc)
pyplot.xlabel('size of invlist')
pyplot.ylabel('nb of invlists')
pyplot.grid()

In [None]:
def get_invlist(invlists, l):
    """Return the content of inverted list `l` as `(ids, codes)` numpy arrays.

    Note that the data is *not* copied: if the inverted index is deallocated
    or changes, accessing the arrays may crash. To avoid this, copy the
    returned arrays.

    `ids` has one entry per stored vector; `codes` is flat, with
    `invlists.code_size` entries per stored vector.
    """
    n_entries = invlists.list_size(l)
    ids_view = faiss.rev_swig_ptr(invlists.get_ids(l), n_entries)
    codes_view = faiss.rev_swig_ptr(invlists.get_codes(l), n_entries * invlists.code_size)
    return ids_view, codes_view

In [None]:
# get content of inverted list #124
list_ids, list_codes = get_invlist(invlists, 124)

In [None]:
# Pick inverted list 17 and display how many vectors it holds.
l = 17
ls = invlists.list_size(l)
ls

In [None]:
# Display the raw object returned by get_ids for list `l` (before wrapping it as numpy).
invlists.get_ids(l)

In [None]:
# Wrap the ids of list `l` as a numpy array of length `ls`.
# This rebinds `list_ids`, which now refers to list `l` rather than list 124.
list_ids = faiss.rev_swig_ptr(invlists.get_ids(l), ls)
print(list_ids)
print(type(list_ids[0]))

In [None]:
# Wrap the codes of list `l` as a flat numpy array with `code_size` entries per
# stored vector, then show them flat and as one row per vector.
list_codes = faiss.rev_swig_ptr(invlists.get_codes(l), ls * invlists.code_size)
print(list_codes)
print(list_codes.size)
print(list_codes.reshape(-1, invlists.code_size))
print(type(list_codes[0]))
# NOTE(review): if this array is a view on Faiss-owned memory, getsizeof
# presumably reports only the array object, not the code bytes — confirm.
print(sys.getsizeof(list_codes))

In [None]:
# List the attributes of the inverted-lists object again (same call as earlier in the notebook).
dir(invlists)

In [None]:
# Display the raw object returned by get_codes for list 0.
invlists.get_codes(0)

In [None]:
# vector ids in the list currently bound to `list_ids`
# (list `l` if the cells above were run in order)
list_ids

In [None]:
# Check that the vectors stored in inverted list 124 are indeed quantized to
# list 124. The ids are read from the list itself rather than hard-coded, so
# the check stays valid when training produces a different partition.
ids_in_list_124, _ = get_invlist(invlists, 124)
coarse_quantizer.assign(xb[ids_in_list_124], 1)

In [None]:
# List the attributes exposed by the coarse quantizer.
dir(coarse_quantizer)

In [None]:
# Read the coarse centroids, i.e. the vectors stored in the flat quantizer.
# reconstruct_n is used instead of the raw `xb` vector because not every Faiss
# release exposes `xb` on flat indexes. The result is flattened first so the
# prints below show the same values as before.
coarse_cen = coarse_quantizer.reconstruct_n(0, coarse_quantizer.ntotal).ravel()
print("coarse_cen.shape", coarse_cen.shape) # 1024 * 64 = 65536
print("coarse_quantizer.ntotal", coarse_quantizer.ntotal)
print("coarse_quantizer.d", coarse_quantizer.d)


# One row per coarse centroid.
coarse_cen = coarse_cen.reshape(coarse_quantizer.ntotal, coarse_quantizer.d)
print(coarse_cen)
print(coarse_cen.shape)

In [None]:
# PQ codes of the list currently bound to `list_codes`, one row of
# `code_size` entries per stored vector
list_codes.reshape(-1, invlists.code_size)

In [None]:
# Generate a single random query vector.
# NOTE(review): this reuses seed 1234, the same seed used for `xb`, so `q` may
# coincide with a database vector (presumably xb[0]) — confirm this is intended.
q = faiss.rand((1, d), 1234)
print(q)

In [None]:
# index.search returns (distances, ids) in that order; name the results accordingly.
distances, ids = index.search(q, k=50)
print("======= Distance ========\n", distances, "\n")
print("======= Vector ID ========\n", ids, "\n")

In [None]:
def distance_full_vec(v1, v2):
    """Return the squared Euclidean distance between `v1` and `v2`.

    Both inputs must have the same length (checked with an assertion) and
    support element-wise subtraction, e.g. numpy arrays.
    """
    assert len(v1) == len(v2)
    delta = v1 - v2
    return np.sum(np.square(delta))

In [None]:
q_single = q[0]

# Squared distance from the query to every coarse centroid, stored as
# (distance, centroid index) pairs.
dist_ls = [
    (distance_full_vec(q_single, centroid), cell_no)
    for cell_no, centroid in enumerate(coarse_cen)
]

In [None]:
# Sort the (distance, centroid index) pairs in place by distance, closest first, and show them.
dist_ls.sort(key=lambda x: x[0])
dist_ls

In [None]:
# To verify the distances reported by the search, load the inverted list of the
# coarse cell closest to the query (first entry of the sorted `dist_ls`).
closest_cell = dist_ls[0][1]
list_ids, list_codes = get_invlist(invlists, closest_cell)

In [None]:
# Ids of the vectors stored in the closest cell, and how many there are.
print(list_ids)
print(list_ids.shape)

In [None]:
# Reshape the flat codes to one row of `code_size` entries per stored vector.
# Re-running this cell is harmless: the reshape of an already 2-D array is a no-op.
list_codes = list_codes.reshape(-1, invlists.code_size)
list_codes

In [None]:
# Residual of the query with respect to the centroid of its closest coarse cell.
q_res = q_single - coarse_cen[closest_cell]
q_res

In [None]:
def construct_distance_table(q_res, pq_cen):
    """Build the table of squared distances between a residual and all PQ centroids.

    Parameters
    ----------
    q_res : array-like of shape (M * dsub,)
        Query residual; sub-vector `m` is `q_res[m * dsub:(m + 1) * dsub]`.
    pq_cen : array-like of shape (M, ksub, dsub)
        PQ centroids, one codebook of `ksub` centroids per sub-quantizer.

    Returns
    -------
    numpy.ndarray of shape (M, ksub), dtype float64
        Entry `[m, k]` is the squared L2 distance between sub-vector `m` of
        `q_res` and centroid `k` of sub-quantizer `m`.

    Raises
    ------
    ValueError
        If `q_res` does not hold exactly `M * dsub` values. Without this check
        a short residual could be silently broadcast and a long one silently
        truncated.
    """
    pq_cen = np.asarray(pq_cen)
    q_res = np.asarray(q_res)
    M, ksub, dsub = pq_cen.shape
    if q_res.shape != (M * dsub,):
        raise ValueError(
            "q_res must have shape (%d,), got %s" % (M * dsub, q_res.shape)
        )

    # Broadcast each sub-vector (M, 1, dsub) against its codebook (M, ksub, dsub).
    diff = pq_cen - q_res.reshape(M, 1, dsub)
    return np.sum(diff * diff, axis=2).astype(np.float64)

In [None]:
# Distance table between the query residual and every PQ centroid.
dist_table = construct_distance_table(q_res, pq_cen)
print(dist_table)
print(dist_table.shape)

In [None]:
def estimate_distance(pq_code, dist_table):
    """Estimate a distance by summing distance-table entries selected by a PQ code.

    `pq_code` must hold one centroid index per row of `dist_table` (checked
    with an assertion); entry `m` of the code selects a column in row `m`.
    Returns the sum of the selected entries.
    """
    n_sub = dist_table.shape[0]
    assert n_sub == len(pq_code)

    return sum(dist_table[sub][code] for sub, code in enumerate(pq_code))

In [None]:
# First entry of `list_codes` (the first stored PQ code once the codes have been reshaped to rows).
list_codes[0]

In [None]:
# Estimated distance between the query and the first vector of the closest cell.
estimate_distance(list_codes[0], dist_table)

In [None]:
# Estimated distance from the query to every vector stored in the closest
# cell, as (distance, vector id) pairs sorted nearest first. `dist_cell` was
# previously displayed without ever being defined.
dist_cell = sorted(
    (estimate_distance(code, dist_table), int(vec_id))
    for vec_id, code in zip(list_ids, list_codes.reshape(-1, invlists.code_size))
)
dist_cell

In [None]:
# Set search-time parameters through a ParameterSpace.
ps = faiss.ParameterSpace()
ps.initialize(index)

# set_index_parameters takes one parameter string (e.g. 'nprobe=1'), not a list.
# To try several settings, loop over a list of such strings and call it once
# per string.
param = 'nprobe=1'
ps.set_index_parameters(index, param)

In [None]:
# Search again with the current index parameters.
# index.search returns (distances, ids) in that order; name the results accordingly.
distances, ids = index.search(q, k=50)
print("======= Distance ========\n", distances, "\n")
print("======= Vector ID ========\n", ids, "\n")