In [1]:
from __future__ import print_function
import os
import sys
import time
import numpy as np
import re
import faiss
from multiprocessing.dummy import Pool as ThreadPool
from matplotlib import pyplot

In [2]:
def mmap_fvecs(fname):
    x = np.memmap(fname, dtype='int32', mode='r')
    d = x[0]
    return x.view('float32').reshape(-1, d + 1)[:, 1:]


def mmap_bvecs(fname):
    x = np.memmap(fname, dtype='uint8', mode='r')
    d = x[:4].view('int32')[0]
    return x.reshape(-1, d + 4)[:, 4:]

def ivecs_read(fname):
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    return a.reshape(-1, d + 1)[:, 1:].copy()

dbname        = 'SIFT100M'
index_key     = 'OPQ16,IVF8192,PQ16'


tmpdir = '../trained_CPU_indexes/bench_cpu_{}_{}'.format(dbname, index_key)

if not os.path.isdir(tmpdir):
    raise("%s does not exist")


#################################################################
# Prepare dataset
#################################################################


print("Preparing dataset", dbname)

if dbname.startswith('SIFT'):
    # SIFT1M to SIFT1000M
    dbsize = int(dbname[4:-1])
    xb = mmap_bvecs('../bigann/bigann_base.bvecs')
    xq = mmap_bvecs('../bigann/bigann_query.bvecs')
    xt = mmap_bvecs('../bigann/bigann_learn.bvecs')

    # trim xb to correct size
    xb = xb[:dbsize * 1000 * 1000]

    gt = ivecs_read('../bigann/gnd/idx_%dM.ivecs' % dbsize)

elif dbname == 'Deep1B':
    xb = mmap_fvecs('../deep1b/base.fvecs')
    xq = mmap_fvecs('../deep1b/deep1B_queries.fvecs')
    xt = mmap_fvecs('../deep1b/learn.fvecs')
    # deep1B's train is is outrageously big
    xt = xt[:10 * 1000 * 1000]
    gt = ivecs_read('../deep1b/deep1B_groundtruth.ivecs')

else:
    print('unknown dataset', dbname, file=sys.stderr)
    sys.exit(1)


print("sizes: B %s Q %s T %s gt %s" % (
    xb.shape, xq.shape, xt.shape, gt.shape))

nq, d = xq.shape
nb, d = xb.shape
assert gt.shape[0] == nq


#################################################################
# Load Index
#################################################################

def get_populated_index():

    filename = "%s/%s_%s_populated.index" % (
        tmpdir, dbname, index_key)

    if not os.path.exists(filename):
        raise("Index does not exist!")
    else:
        print("loading", filename)
        index = faiss.read_index(filename)
    return index


index = get_populated_index()

Preparing dataset SIFT100M
sizes: B (100000000, 128) Q (10000, 128) T (100000000, 128) gt (10000, 1000)
loading ../trained_CPU_indexes/bench_cpu_SIFT100M_OPQ16,IVF8192,PQ16/SIFT100M_OPQ16,IVF8192,PQ16_populated.index


In [15]:
HBM_bank_num = int(21) 
cluster_num   = 8192
PQ_bytes      = 16

folder_name = 'FPGA_data_{}_{}_HBM_{}_banks'.format(dbname, index_key, HBM_bank_num)


if not os.path.exists('./saved_npy_data/' + folder_name):
    os.mkdir('./saved_npy_data/' + folder_name)

In [4]:
""" Part 1: Save index, quantizer, and HBM related contents """

' Part 1: Save index, quantizer, and HBM related contents '

In [5]:
""" Get OPQ Matrix """
linear_trans = faiss.downcast_VectorTransform(index.chain.at(0))
OPQ_mat = faiss.vector_to_array(linear_trans.A)
OPQ_mat = OPQ_mat.reshape((128,128))
OPQ_mat = np.array(OPQ_mat, dtype=np.float32)
print(OPQ_mat, OPQ_mat.shape)

[[-0.04185106  0.03428996  0.01463496 ...  0.26744068  0.08420503
  -0.12249091]
 [ 0.03530787  0.08281487 -0.05807371 ... -0.08664993 -0.05088573
  -0.24373505]
 [ 0.04926687  0.01517367 -0.00573028 ...  0.00100601  0.03905569
  -0.13516347]
 ...
 [ 0.10964511  0.08982475 -0.02009742 ... -0.0578454  -0.0549995
   0.04094499]
 [-0.12942526  0.17826557 -0.02329139 ... -0.0937097  -0.03088556
   0.12256143]
 [-0.1314367   0.14714412  0.01808932 ...  0.02709483 -0.03784304
  -0.10600516]] (128, 128)


In [6]:
""" Get IVF index (coarse quantizer) and product quantizer"""

downcasted_index = faiss.downcast_index(index.index)

def get_sub_quantizer_centroids(index):
    """
    return the sub-quantizer centroids, 
    shape = (m, 256, d / m)
    e.g., d=128, m=16 -> (16, 256, 8)
    """
    pq = index.pq
    cen = faiss.vector_to_array(pq.centroids)
    cen = cen.reshape(pq.M, pq.ksub, pq.dsub)
    
    return cen

def get_coarse_quantizer_centroids(index):
    """
    return the coarse-grained quantizer centroids,
    shape = (nlist, d),
    e.g., nlist=1024, d=128 -> (1024, 128)
    """
    coarse_quantizer = faiss.downcast_index(index.quantizer)
    coarse_cen = faiss.vector_to_array(coarse_quantizer.xb)

    coarse_cen = coarse_cen.reshape(coarse_quantizer.ntotal, coarse_quantizer.d)
    return coarse_cen

# Get Sub quantizer info
sub_cen = get_sub_quantizer_centroids(downcasted_index)
print("==== Sub-quantizer ====\n{}\n\nshape:{}\n".format(sub_cen, sub_cen.shape))

# Get Coarse quantizer info
coarse_cen = get_coarse_quantizer_centroids(downcasted_index)
print("==== Coarse-quantizer ====\n{}\n\nshape:{}\n".format(coarse_cen, coarse_cen.shape))

==== Sub-quantizer ====
[[[-30.197075   -23.236074   -13.366585   ... -11.393365     5.8771296
   -18.89725   ]
  [-33.022923   -15.39339      9.522758   ...  -8.383547    16.251562
    12.538808  ]
  [-14.867078    -2.4835837   -1.7066059  ...   3.1077738   19.915678
   -16.846704  ]
  ...
  [ 29.21712     13.204249     2.3176963  ...  16.067656   -24.286322
    30.081974  ]
  [ 28.375912    13.999012   -11.073188   ...  -4.282479   -21.07674
     6.6677904 ]
  [ 28.037931    42.105015     1.0285904  ...  35.699448    13.697935
   -14.478575  ]]

 [[ 20.051786    -8.370474     0.5161412  ...  12.963359   -15.200961
    27.656296  ]
  [ 15.461749   -10.884682     5.703733   ... -14.309333   -28.87946
     9.411468  ]
  [  6.2166452   -9.853486    16.709171   ...   9.081562   -18.439915
     7.429808  ]
  ...
  [-28.226736    26.75037    -23.633837   ...  -0.46722895  56.466084
   -13.462248  ]
  [-16.889128    30.844292    -5.331582   ...  17.001925    35.040524
   -19.33138   ]
  [-21

In [10]:
# Save the coarse quantizer and the product quantizer

OPQ_mat = np.array(OPQ_mat, dtype=np.float32)
PQ_quantizer = np.array(sub_cen, dtype=np.float32)
coarse_cen = np.array(coarse_cen, dtype=np.float32)

# 16, 256, 8 -> (0,0,0:8) the first row of the subquantizer of the first sub-vector 
print(OPQ_mat.shape, PQ_quantizer.shape, coarse_cen.shape)

OPQ_mat.tofile("./saved_npy_data/{}/OPQ_matrix_float32_{}_{}_raw".format(
    folder_name, OPQ_mat.shape[0], OPQ_mat.shape[1]))
PQ_quantizer.tofile("./saved_npy_data/{}/product_quantizer_float32_{}_{}_{}_raw".format(
    folder_name, PQ_quantizer.shape[0], PQ_quantizer.shape[1], PQ_quantizer.shape[2]))
coarse_cen.tofile("./saved_npy_data/{}/vector_quantizer_float32_{}_{}_raw".format(
    folder_name, coarse_cen.shape[0], coarse_cen.shape[1]))

(128, 128) (16, 256, 8) (8192, 128)


In [11]:
""" Get contents in a single Voronoi Cell """
invlists = downcasted_index.invlists

def get_invlist(invlists, l):
    """ 
    returns the (vector IDs set, PQ cose set) of list ID "l"
    list_ids: (#vec_in_list, ), e.g., #vec_in_list=10 -> (10, )
    list_codes: (#vec_in_list, m), e.g., #vec_in_list=10, m=16 -> (10, 16)
    
    That the data is *NOT* copied: if the inverted index is deallocated or changes, accessing the array may crash.
    To avoid this, just clone the output arrays on output. 
    """
    ls = invlists.list_size(l)
    list_vec_ids = faiss.rev_swig_ptr(invlists.get_ids(l), ls)
    list_PQ_codes = faiss.rev_swig_ptr(invlists.get_codes(l), ls * invlists.code_size)
    list_PQ_codes = list_PQ_codes.reshape(-1, invlists.code_size)
    return list_vec_ids, list_PQ_codes


In [12]:
# Example of using function "get_invlist"
list_id = 123
list_vec_ids, list_PQ_codes = get_invlist(invlists, list_id)
print("==== Vector IDs ====\n{}\n\nshape: {}\n".format(list_vec_ids, list_vec_ids.shape))
print("==== PQ codes ====\n{}\n\nshape: {}\n".format(list_PQ_codes, list_PQ_codes.shape))

==== Vector IDs ====
[    7322     8476     9861 ... 99992803 99993000 99998092]

shape: (22469,)

==== PQ codes ====
[[ 35 194 255 ...  87  90 174]
 [162 208  42 ...  66 146 239]
 [155 136 127 ... 170 216  62]
 ...
 [ 35 136 177 ...  91  25  42]
 [  3   3 173 ... 102  82  70]
 [ 71 135  12 ... 113  92  78]]

shape: (22469, 16)



In [13]:
def get_contents_to_HBM(invlists, cluster_id, HBM_bank_num=int(21)):
    """
    For a single cluster (list), extract the contents in the format that HBM loads
      inputs:
        invlists: the Faiss index.invlists object
        cluster_id: e.g., 0~8191 for nlist=8192
        HBM_bank_num: 21 for default, athough there are 32 banks on U280, 
                    we don't have enough hardware logic to load and compute at that rate
      outputs:
        HBM_bank_contents( content of 21 banks): a list of 21 element
            each element is a byte object with a set of contents
            the size of the content is m * 64 bytes
            the contents includes (3 * (int32 vector ID) (16 byte PQ code)) + 4byte padding
        entries_per_bank: int, all HBM shares the same number of 512-bit items to scan
        last_valid_element: int from 0 to 62 (63 numbers in total given 21 HBM channels)
            some of the elements in the last row are paddings, which of them is the last non-padding (valid) 
            
      term:
        entry: a 512-bit entry containing 3 PQ codes
        vector: a 20-byte vector containing 4 byte vector ID + 16 byte PQ code
    """
    
    list_vec_ids, list_PQ_codes = get_invlist(invlists, cluster_id)
#     print("list_vec_ids", list_vec_ids.shape)
#     print("list_PQ_codes", list_PQ_codes.shape)
    num_vec = list_vec_ids.shape[0]
    assert list_vec_ids.shape[0] == list_PQ_codes.shape[0]
    
#     print("num_vec", num_vec)
    
    if num_vec % (HBM_bank_num * 3) == 0:
        # no padding
        entries_per_bank = num_vec / (HBM_bank_num * 3)
        last_valid_element = HBM_bank_num * 3 - 1
        num_vec_per_HBM = [int(num_vec / HBM_bank_num)] * HBM_bank_num
        num_pad_per_HBM = [0] * HBM_bank_num
    else:
        # with padding
        entries_per_bank = int(num_vec / (HBM_bank_num * 3)) + 1
        last_valid_element = num_vec % (HBM_bank_num * 3) - 1
        num_vec_per_HBM = []
        num_pad_per_HBM = []
        
        counted_banks = 0
        # bank with full valid elements
        for i in range(int((last_valid_element + 1) / 3)):
            num_vec_per_HBM += [entries_per_bank * 3]
            num_pad_per_HBM += [0]
        counted_banks += int((last_valid_element + 1) / 3)
        
        # (optional) bank with some valid elements and some padding in the last entry
        if (last_valid_element + 1) % 3 != 0:
            num_vec_per_HBM += [(entries_per_bank - 1) * 3 + (last_valid_element + 1) % 3]
            num_pad_per_HBM += [3 - (last_valid_element + 1) % 3]
            counted_banks += 1
        
        # (optional) bank with full padding in the last entry
        for i in range(HBM_bank_num - counted_banks):
            num_vec_per_HBM += [int((entries_per_bank - 1) * 3)]
            num_pad_per_HBM += [3]
            
    assert np.sum(np.array(num_vec_per_HBM)) == num_vec
    assert entries_per_bank * HBM_bank_num * 3 - np.sum(np.array(num_pad_per_HBM)) == num_vec
    
    HBM_bank_contents = []
    
    start = int(0)
    
    zero = int(0)
    empty_byte = zero.to_bytes(1, "little", signed=True)
    
#     print("num_vec_per_HBM:", num_vec_per_HBM)
#     print("num_pad_per_HBM:", num_pad_per_HBM)
    
    for i in range(HBM_bank_num):
        
        # add valid vectors first
        end = start + num_vec_per_HBM[i]
        vec_per_bank_count = 0
        byte_obj = bytes()
        
#         print(start, end)
        
        for vec_id_per_bank in range(start, end):
            
            # Vec ID = signed int
            vec_id = int(list_vec_ids[vec_id_per_bank])
            # Xilinx's ap int use little endian
            # Linux on X86 use little endian
            # https://serverfault.com/questions/163487/how-to-tell-if-a-linux-system-is-big-endian-or-little-endian
            byte_obj += vec_id.to_bytes(4, "little", signed=True)
            
            # PQ code = unsigned char
            PQ_codes = list_PQ_codes[vec_id_per_bank]
            for code in PQ_codes:
                code = int(code)
                # Xilinx's ap int use little endian
                byte_obj += code.to_bytes(1, "little", signed=False)
            
            vec_per_bank_count += 1
            if vec_per_bank_count % 3 == 0:
                byte_obj += empty_byte * 4
        
        start = end
        
        # then add paddings
        if num_pad_per_HBM[i] > 0:
            for pad_id in range(num_pad_per_HBM[i]):
                byte_obj += empty_byte * 20
            byte_obj += empty_byte * 4
        
        HBM_bank_contents += [byte_obj]
       
    for i in range(HBM_bank_num):
        assert len(HBM_bank_contents[i]) == len(HBM_bank_contents[0])
        assert len(HBM_bank_contents[i]) == 64 * entries_per_bank
    
    return HBM_bank_contents, entries_per_bank, last_valid_element

In [16]:
# Get HBM contents from all clusters

list_HBM_bank_contents = [] # array of cluster_num * HBM_bank_num elements
list_entries_per_bank = []
list_last_valid_element = []

for c in range(cluster_num):
    HBM_bank_contents, entries_per_bank, last_valid_element = get_contents_to_HBM(invlists, c, HBM_bank_num)
    list_HBM_bank_contents += HBM_bank_contents
    list_entries_per_bank += [entries_per_bank]
    list_last_valid_element += [last_valid_element]

In [17]:
# Reorder list_HBM_bank_contents

print(len(list_HBM_bank_contents))
print("list_entries_per_bank:\n", list_entries_per_bank)
print("list_last_valid_element:\n", list_last_valid_element)

list_HBM_bank_contents_reordered = [] # put all contents of the same HBM bank together

for b in range(HBM_bank_num):
    sub_list = []
    for c in range(cluster_num):
        sub_list += [list_HBM_bank_contents[c * HBM_bank_num + b]]
    print(len(sub_list), len(sub_list[0]))
    list_HBM_bank_contents_reordered += [sub_list]
    
print("list_HBM_bank_contents_reordered:", len(list_HBM_bank_contents_reordered), len(list_HBM_bank_contents_reordered[0]))

172032
list_entries_per_bank:
 [144, 203, 143, 244, 246, 95, 214, 168, 250, 185, 217, 44.0, 157, 181, 202, 156, 212, 162, 203, 156, 161, 191, 91, 133, 166, 312, 241, 136, 183, 146, 189, 159, 172, 145, 113, 185, 133, 142, 247, 165, 128, 163, 670, 182, 243, 166, 54, 166, 193, 169, 198, 163, 180, 156, 223, 348, 354, 197, 224, 119, 187, 174, 177, 203, 133, 118, 217, 354, 187, 173, 180, 142, 298, 178, 161, 149, 163, 176, 222, 187, 159, 201, 279, 170, 138, 139, 152, 171, 189, 198, 514, 129, 487, 191, 317, 237, 175, 141, 120, 223, 218, 197, 196, 175, 199, 392, 182, 228, 149, 166, 203, 211, 131, 200, 149, 87, 159, 506, 281, 155, 148, 204, 132, 357, 131, 175, 233, 204, 194, 149, 180, 177, 160, 173, 147, 165, 316, 181, 196, 265, 98, 158, 185, 173, 168, 274, 289, 170, 135, 150, 126, 187, 177, 135.0, 189.0, 165, 126, 183, 197, 156, 158, 181, 211, 29, 229, 112, 140, 199, 160, 267, 176, 247, 188.0, 222, 151, 169, 137, 149, 162, 153, 157, 205, 178, 199, 181, 181, 119, 138, 210, 102, 234, 193, 159, 15

In [18]:
# Concatenate 

HBM_bank_contents_all = [bytes()] * HBM_bank_num # contents of each bank
for b in range(HBM_bank_num):
    HBM_bank_contents_all[b] = HBM_bank_contents_all[b].join(list_HBM_bank_contents_reordered[b])
    
total_size = np.sum(np.array([len(h) for h in HBM_bank_contents_all]))
print("HBM_bank_contents_all: shape: {}\tsize: {}".format(len(HBM_bank_contents_all), total_size))

HBM_bank_contents_all: shape: 21	size: 2138743488


In [19]:
# Save HBM contents 

for b in range(HBM_bank_num):
    assert len(HBM_bank_contents_all[b]) == len(HBM_bank_contents_all[0])

for b in range(HBM_bank_num):
    with open ('./saved_npy_data/{}/HBM_bank_{}_raw'.format(folder_name, b), 'wb') as f:
        f.write(HBM_bank_contents_all[b])

In [20]:
# Save control contents

#  The format of storing HBM_info_start_addr_and_scanned_entries_every_cell_and_last_element_valid: 
#     8192 start_addr, then 8192 scanned_entries_every_cell, then 8192 last_valid_element
#     int start_addr_LUT[nlist];
#     int scanned_entries_every_cell_LUT[nlist];
#     int last_valid_channel_LUT[nlist];  

list_start_addr_every_cell = [0]
for c in range(cluster_num - 1):
    list_start_addr_every_cell.append(list_start_addr_every_cell[c] + list_entries_per_bank[c])

assert len(list_start_addr_every_cell) == len(list_entries_per_bank) and\
    len(list_start_addr_every_cell) == len(list_last_valid_element)

print(list_start_addr_every_cell[-1])

HBM_info_start_addr_and_scanned_entries_every_cell_and_last_element_valid = \
    list_start_addr_every_cell + list_entries_per_bank + list_last_valid_element

HBM_info_start_addr_and_scanned_entries_every_cell_and_last_element_valid = np.array(
    HBM_info_start_addr_and_scanned_entries_every_cell_and_last_element_valid, dtype=np.int32)

HBM_info_start_addr_and_scanned_entries_every_cell_and_last_element_valid.tofile(
    "./saved_npy_data/{}/HBM_info_start_addr_and_scanned_entries_every_cell_and_last_element_valid_3_by_{}_raw".format(
        folder_name, cluster_num))

1591195.0


In [21]:
""" Part 2: Save results of different nprobe """

' Part 2: Save results of different nprobe '

In [23]:
#################################################################
# Perform searches and save results
#################################################################


parametersets = ['nprobe=17', 'nprobe=18', 'nprobe=32']

topK=10

ps = faiss.ParameterSpace()
ps.initialize(index)

# make sure queries are in RAM
xq = xq.astype('float32').copy()
xq = np.array(xq, dtype=np.float32)
xq.tofile("./saved_npy_data/{}/query_vectors_float32_{}_{}_raw".format(folder_name, xq.shape[0], xq.shape[1]))

# a static C++ object that collects statistics about searches
ivfpq_stats = faiss.cvar.indexIVFPQ_stats
ivf_stats = faiss.cvar.indexIVF_stats


# we do queries in a single thread
faiss.omp_set_num_threads(1)

print(' ' * len(parametersets[0]), '\t', 'R@10   time    %pass')
# print(' ' * len(parametersets[0]), '\t', 'R@1    R@10   R@100     time    %pass')

for param in parametersets:
    print(param, '\t', end=' ')
    sys.stdout.flush()
    ps.set_index_parameters(index, param)
    t0 = time.time()
    ivfpq_stats.reset()
    ivf_stats.reset()
    faiss_result_dist, faiss_result_idx = index.search(xq, topK)
    t1 = time.time()
    for rank in [topK]:
        n_ok = (faiss_result_idx[:, :rank] == gt[:, :1]).sum()
        print("%.4f" % (n_ok / float(nq)), end=' ')
    print("%8.3f  " % ((t1 - t0) * 1000.0 / nq), end=' ')
    print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivf_stats.ndis))
    
    D = np.array(faiss_result_dist, dtype=np.float32)
    I = np.array(faiss_result_idx, dtype=np.int32)
    print(D.shape, I.shape, xq.shape)
    D.tofile("./saved_npy_data/{}/result_nprobe_{}_distance_float32_{}_{}_raw".format(
        folder_name, param[-2:], D.shape[0], D.shape[1]))
    I.tofile("./saved_npy_data/{}/result_nprobe_{}_index_int32_{}_{}_raw".format(
        folder_name, param[-2:], I.shape[0], I.shape[1]))

          	 R@10   time    %pass
nprobe=17 	 0.8031    2.371    0.00
(10000, 10) (10000, 10) (10000, 128)
nprobe=18 	 0.8058    2.407    0.00
(10000, 10) (10000, 10) (10000, 128)
nprobe=32 	 0.8287    4.154    0.00
(10000, 10) (10000, 10) (10000, 128)
