In [1]:
import heapq
import struct
import sys
import time

import hnswlib
import numpy as np

In [2]:
def mmap_fvecs(fname):
    """
    Memory-map an .fvecs file and return its vectors as a 2-D float32 view.

    The first int32 of the file is read as the dimension d; every record is
    then treated as d + 1 four-byte words whose first word (the per-record
    dimension) is dropped from the returned view.
    """
    raw_words = np.memmap(fname, dtype='int32', mode='r')
    words_per_record = raw_words[0] + 1
    records = raw_words.view('float32').reshape(-1, words_per_record)
    return records[:, 1:]

def mmap_bvecs(fname):
    """
    Memory-map a .bvecs file and return its vectors as a 2-D uint8 view.

    The first four bytes are reinterpreted as an int32 dimension d; every
    record is then d + 4 bytes long and the 4-byte dimension prefix of each
    record is dropped from the returned view.
    """
    raw_bytes = np.memmap(fname, dtype='uint8', mode='r')
    d = raw_bytes[:4].view('int32')[0]
    records = raw_bytes.reshape(-1, d + 4)
    return records[:, 4:]

def ivecs_read(fname):
    """
    Load an .ivecs file fully into memory as a 2-D int32 array.

    Wenqi: format of the ground truth (for 10000 query vectors):
        1000(topK), [1000 ids]
        1000(topK), [1000 ids]
             ...     ...
        1000(topK), [1000 ids]
      10000 rows in total, 10000 * 1001 elements, 10000 * 1001 * 4 bytes

    The leading count of every row is stripped; a copy is returned so the
    result does not keep the full raw buffer alive.
    """
    raw_words = np.fromfile(fname, dtype='int32')
    words_per_row = raw_words[0] + 1
    rows = raw_words.reshape(-1, words_per_row)
    return rows[:, 1:].copy()

def fvecs_read(fname):
    """Load an .fvecs file by reading it as .ivecs and reinterpreting the bits as float32."""
    as_int32 = ivecs_read(fname)
    return as_int32.view('float32')

In [3]:
dbname = 'SIFT1M'
index_path='../indexes/{}_index.bin'.format(dbname)
dim=128

# NOTE(review): machine-specific absolute location of the bigann files;
# adjust this single constant when running elsewhere.
BIGANN_DIR = '/mnt/scratch/wenqi/Faiss_experiments/bigann'

if dbname.startswith('SIFT'):
    # SIFT1M to SIFT1000M: the digits between 'SIFT' and the trailing 'M'
    # give the database size in millions of vectors.
    dbsize = int(dbname[4:-1])
    N_VEC = int(dbsize * 1000 * 1000)

    xb = mmap_bvecs(BIGANN_DIR + '/bigann_base.bvecs')
    xq = mmap_bvecs(BIGANN_DIR + '/bigann_query.bvecs')
    gt = ivecs_read(BIGANN_DIR + '/gnd/idx_%dM.ivecs' % dbsize)

    # trim xb to correct size (still a memory-mapped uint8 view)
    xb = xb[:N_VEC]

    # Wenqi: load xq to main memory as float32.
    # A single np.array call already converts the memmap view into a plain
    # in-memory ndarray; the former astype().copy() pair only added copies.
    xq = np.array(xq, dtype=np.float32, order='C')
    gt = np.array(gt, dtype=np.int32)

    print("Vector shapes:")
    print("Base vector xb: ", xb.shape)
    print("Query vector xq: ", xq.shape)
    print("Ground truth gt: ", gt.shape)
else:
    # `sys` is provided by the notebook's import cell.
    print('unknown dataset', dbname, file=sys.stderr)
    sys.exit(1)

Vector shapes:
Base vector xb:  (1000000, 128)
Query vector xq:  (10000, 128)
Ground truth gt:  (10000, 1000)


## Core HNSW load and search

In [4]:
def convertBytes(bytestring, dtype='int'):
    """
    Convert a little-endian byte string to a single Python value.

    bytestring: the raw bytes of exactly one element
    dtype = {int, long, float, double}
        'int' / 'long' are decoded as unsigned integers of whatever length
        the byte string has; 'float' needs 4 bytes and 'double' 8 bytes.
    struct: https://docs.python.org/3/library/struct.html

    Raises ValueError for an unknown dtype and struct.error when a
    float/double byte string has the wrong length.
    """
    # int from bytes is much faster than struct.unpack
    if dtype in ('int', 'long'):
        return int.from_bytes(bytestring, byteorder='little', signed=False)
    # '<' pins the byte order to little-endian so floats are decoded with the
    # same convention as the integers above, independent of the host.
    if dtype == 'float':
        return struct.unpack('<f', bytestring)[0]
    if dtype == 'double':
        return struct.unpack('<d', bytestring)[0]
    raise ValueError(
        "unsupported dtype {!r}; expected 'int', 'long', 'float' or 'double'".format(dtype))

# Wenqi: the fastest way to load a bytestring list is to use *** np.frombuffer ***
def convertBytesList(bytestring, dtype='int'):
    """
    Given a little-endian byte string, return the list of values it encodes.

    dtype = {int, long, float, double}
        'int' -> unsigned 4-byte, 'long' -> unsigned 8-byte,
        'float' -> 4-byte IEEE, 'double' -> 8-byte IEEE.
    Trailing bytes that do not form a complete element are ignored.
    Returns a list of plain Python ints/floats; raises ValueError for an
    unknown dtype.
    """
    numpy_dtypes = {'int': '<u4', 'long': '<u8', 'float': '<f4', 'double': '<f8'}
    if dtype not in numpy_dtypes:
        raise ValueError(
            "unsupported dtype {!r}; expected 'int', 'long', 'float' or 'double'".format(dtype))

    element_dtype = np.dtype(numpy_dtypes[dtype])
    element_count = len(bytestring) // element_dtype.itemsize
    if element_count == 0:
        return []
    # One vectorised decode instead of slicing and converting per element;
    # `count` makes frombuffer stop before any incomplete trailing element.
    return np.frombuffer(bytestring, dtype=element_dtype, count=element_count).tolist()

def calculateDist(query_data, db_vec):
    """
    Squared L2 distance between two 1-d numpy arrays.

    HNSWLib reports squared L2 distances, so the square root is deliberately
    not taken here either.
    """
    difference = query_data - db_vec
    return np.square(difference).sum()


def merge_two_distance_list(list_A, list_B, k):
    """
    Merge two result lists, keeping the k entries with the smallest distance.

    input:
        both lists have the format [(dist, server_ID, vec_ID), ...]
        (entries may be tuples or lists); they need not be sorted
        k: maximum number of entries to return; k <= 0 yields []
    return:
        a list of (dist, server_ID, vec_ID) tuples in ascending distance
        order (the first entry is the nearest neighbor). Entries with equal
        distance are ordered by descending (server_ID, vec_ID).
    """
    if k <= 0:
        return []

    # Negating the distance lets heapq.nlargest pick the k smallest
    # distances and return them nearest-first in a single call.
    negated = [(-dist, server_ID, vec_ID)
               for dist, server_ID, vec_ID in list(list_A) + list(list_B)]
    nearest = heapq.nlargest(k, negated)
    return [(-neg_dist, server_ID, vec_ID) for neg_dist, server_ID, vec_ID in nearest]

In [8]:
import struct 

        
class HNSW_index():
    
    """
    Returned result list always in the format of (dist, server_ID, vec_ID),
        in ascending distance order (the first result is the nearest neighbor)
    """
    
    def __init__(self, local_server_ID=0, dim=128):
        
        self.dim = dim
        self.local_server_ID = local_server_ID
        
        # Meta Info
        self.offsetLevel0_ = None
        self.max_elements_ = None
        self.cur_element_count = None
        self.size_data_per_element_ = None
        self.label_offset_ = None
        self.offsetData_ = None
        self.maxlevel_ = None
        self.enterpoint_node_ = None
        self.maxM_ = None
        self.maxM0_ = None
        self.M_ = None
        self.mult_ = None # the probability that a node is one a higher level
        self.ef_construction_ = None
        
        # ground layer, all with length of cur_element_count
        self.links_count_l0 = None # a list of link_count
        self.links_l0 = None # a list of links per vector
        self.data_l0 = None # a list of vectors
        label_l0 = None # a list of vector IDs
        
        # upper layers, all with length of cur_element_count
        self.element_levels_ = None # the level per vector
        self.links = None # the upper layer link info (link count + links)
        
        # remote nodes, order according to local ID (not label ID)
        #  remote_links: an 2-D array (cur_element_count, k), 
        #    each element is a tuple: (server_ID, vector_ID)
        self.remote_links_count = None
        self.remote_links = None
        
    def load_meta_info(self, index_bin):
        """
        index_bin = hnswlib index binary 
        
        HNSW save index order:
            https://github.com/WenqiJiang/hnswlib-eval/blob/master/hnswlib/hnswalg.h#L588-L616
        """
        self.offsetLevel0_ = int.from_bytes(index_bin[0:8], byteorder='little', signed=False)
        self.max_elements_ = int.from_bytes(index_bin[8:16], byteorder='little', signed=False)
        self.cur_element_count = int.from_bytes(index_bin[16:24], byteorder='little', signed=False)
        self.size_data_per_element_ = int.from_bytes(index_bin[24:32], byteorder='little', signed=False)
        self.label_offset_ = int.from_bytes(index_bin[32:40], byteorder='little', signed=False)
        self.offsetData_ = int.from_bytes(index_bin[40:48], byteorder='little', signed=False)
        self.maxlevel_ = int.from_bytes(index_bin[48:52], byteorder='little', signed=False)
        self.enterpoint_node_ = int.from_bytes(index_bin[52:56], byteorder='little', signed=False)
        self.maxM_ = int.from_bytes(index_bin[56:64], byteorder='little', signed=False)
        self.maxM0_ = int.from_bytes(index_bin[64:72], byteorder='little', signed=False)
        self.M_ = int.from_bytes(index_bin[72:80], byteorder='little', signed=False)
        self.mult_ = struct.unpack('d', index_bin[80:88])[0] # the probability that a node is one a higher level
        self.ef_construction_ = int.from_bytes(index_bin[88:96], byteorder='little', signed=False)
        

        print("offsetLevel0_", self.offsetLevel0_)
        print("max_elements_", self.max_elements_)
        print("cur_element_count", self.cur_element_count)
        print("size_data_per_element_", self.size_data_per_element_)
        print("label_offset_", self.label_offset_)
        print("offsetData_", self.offsetData_)
        print("maxlevel_", self.maxlevel_)
        print("enterpoint_node_", self.enterpoint_node_)
        print("maxM_", self.maxM_)
        print("maxM0_", self.maxM0_)
        print("M_", self.M_)
        print("mult_", self.mult_)
        print("ef_construction_", self.ef_construction_)
        
    
    def load_ground_layer(self, index_bin):
        """
        Get the ground layer vector ID, vectors, and links:
            links_count_l0: vec_num
            links_l0: maxM0_ * vec_num 
            data_l0: (dim, vec_num)
            label_l0: vec_num
        """
        
        # Layer 0 data 
        start_byte_pointer = 96
        delta = self.cur_element_count * self.size_data_per_element_
        data_level0 = index_bin[start_byte_pointer: start_byte_pointer + delta]
        
        size = len(data_level0)
        self.links_count_l0 = []
        self.links_l0 = np.zeros((self.cur_element_count, self.maxM0_), dtype=int)
        self.data_l0 = np.zeros((self.cur_element_count, self.dim))
        self.label_l0 = []

        data_l0_list = []
        
        assert len(data_level0) == self.size_data_per_element_ * self.cur_element_count
        
        size_link_count = 4
        size_links = self.maxM0_ * 4
        size_vectors = self.dim * 4
        size_label = 8
        
        assert self.size_data_per_element_ == \
            size_link_count + size_links + size_vectors + size_label
            
        for i in range(self.cur_element_count):
            # per ground layer node: (link_count (int), links (int array of len=maxM0_), 
            #    vector (float array of len=dim, vector ID (long)))
            
            addr_link_count = i * self.size_data_per_element_ 
            addr_links = addr_link_count + size_link_count
            addr_vectors = addr_links + size_links
            addr_label = addr_vectors + size_vectors
            
            tmp_bytes = data_level0[addr_link_count: addr_link_count + size_link_count]
            self.links_count_l0.append(convertBytes(tmp_bytes, dtype='int'))
        
            tmp_bytes = data_level0[addr_links: addr_links + size_links]
            self.links_l0[i] = np.frombuffer(tmp_bytes, dtype=np.int32)
            
            tmp_bytes = data_level0[addr_vectors: addr_vectors + size_vectors]
            self.data_l0[i] = np.frombuffer(tmp_bytes, dtype=np.float32)
            
            tmp_bytes = data_level0[addr_label: addr_label + size_label]
            self.label_l0.append(convertBytes(tmp_bytes, dtype='long'))


    def load_upper_layers(self, index_bin):
        """
        Get the upper layer info:
            element_levels_: the levels of each vector
            links: list of upper links
        """
        
        # meta + ground data
        start_byte_pointer = 96 + self.max_elements_ * self.size_data_per_element_
        
        # Upper layers
        links_count = 0
        size_links_per_element_ = self.maxM_ * 4 + 4
        self.element_levels_ = []
        self.links = []

        for i in range(self.cur_element_count):
            tmp_bytes = index_bin[start_byte_pointer:start_byte_pointer+4]
            linkListSize = convertBytes(tmp_bytes, dtype='int')
            start_byte_pointer += 4
            
            # if an element is only on ground layer, it has no links on upper layers at all
            if linkListSize == 0:
                self.element_levels_.append(0)
                self.links.append([])
            else:
                level = int(linkListSize / size_links_per_element_)
                self.element_levels_.append(level)
                tmp_bytes = index_bin[start_byte_pointer:start_byte_pointer+linkListSize]
                links_tmp = list(np.frombuffer(tmp_bytes, dtype=np.int32))
                start_byte_pointer += linkListSize
                links_count += linkListSize / 4;
                self.links.append(links_tmp)

        assert start_byte_pointer == len(index_bin) # 6606296

    def insertRemote(self, remote_hnswlib_indexes, remote_server_IDs, ef=128):
        """
        Input: 
            remote_hnswlib_indexes: a list of remote_hnswlib_index
                remote_hnswlib_index: index loaded by remote memory (hnswlib object)
            remote_server_IDs: a list of remote index IDs respective to remote_hnswlib_indexes
                e.g., this is server 1, and there are four servers in totol,
                    then the remote index IDs should be [0, 2, 3]
        """
        self.remote_links_count = []
        self.remote_links = []
        
        remote_I_list = []
        remote_D_list = []
        remote_server_ID_list = []
        
        k = self.maxM0_
        
        assert len(remote_hnswlib_indexes) == len(remote_server_IDs)
        
        # query all servers
        for i in range(len(remote_server_IDs)):
            remote_hnswlib_index = remote_hnswlib_indexes[i]
            remote_server_ID = remote_server_IDs[i]
        
            query = self.data_l0
            remote_hnswlib_index.set_ef(ef)
            I, D = remote_hnswlib_index.knn_query(query, k=k)
            remote_I_list.append(I)
            remote_D_list.append(D)
            remote_server_ID_list.append(
                np.ones((I.shape[0], I.shape[1]), dtype=np.int32) * int(remote_server_ID))
    
        # merge results per server
        remote_I = np.concatenate(remote_I_list, axis=1)
        remote_D = np.concatenate(remote_D_list, axis=1)
        remote_server_ID = np.concatenate(remote_server_ID_list, axis=1)
        
        D_server_ID_I_list = [[] for i in range(remote_I.shape[0])]
        server_ID_I_list= [[] for i in range(remote_I.shape[0])]
        for i in range(remote_I.shape[0]):
            for j in range(remote_I.shape[1]):
                D_server_ID_I_list[i].append((remote_D[i][j], remote_server_ID[i][j], remote_I[i][j]))
            D_server_ID_I_list[i].sort()
            D_server_ID_I_list[i] = D_server_ID_I_list[i][:k]
            server_ID_I_list[i] = [(s, i) for d, s, i in D_server_ID_I_list[i]]
                
        
        self.remote_links_count = [k for i in range(self.cur_element_count)]
        #  remote_links: an 2-D array (cur_element_count x k), 
        #    each element is a tuple: (server_ID, vector_ID)
        self.remote_links = server_ID_I_list
        

    def searchKnn(self, q_data, k, ef, debug=False):
        """
        Search the local graph for the k nearest neighbors of q_data.

        Input:
            q_data: 1-d query vector
            k: number of results to return
            ef: size of the dynamic candidate list used on the ground layer
            debug: print the traversal step by step
        Output:
            result: a list of [distance, local_server_ID, vec_ID] in
                ascending distance (the first result is the nearest neighbor)
            search_path_local_ID: set of local node IDs that were expanded
            search_path_vec_ID: the same nodes as vector IDs (labels)

        Requires load_meta_info, load_ground_layer and load_upper_layers.
        """
        links_count_l0 = self.links_count_l0
        links_l0 = self.links_l0
        data_l0 = self.data_l0
        links = self.links
        label_l0 = self.label_l0

        # One upper-layer record is [link_count, id_1 .. id_maxM_]; this was
        # hard-coded as 17, which is only right for maxM_ == 16.
        links_per_level = self.maxM_ + 1

        currObj = self.enterpoint_node_
        curdist = calculateDist(q_data, data_l0[currObj])

        search_path_local_ID = set()
        search_path_vec_ID = set()

        # search upper layers: greedy descent, one best node per level
        for level in reversed(range(1, self.maxlevel_ + 1)):
            if debug:
                print("")
                print("level: ", level)
            changed = True
            while changed:
                if debug:
                    print("current object: ", currObj, ", current distance: ", curdist)
                search_path_local_ID.add(currObj)
                changed = False
                ### Wenqi: here, assuming Node ID can be used to retrieve upper links (which is not true for indexes with ID starting from non-0)
                if len(links[currObj]) == 0:
                    break

                start_index = (level - 1) * links_per_level
                size = links[currObj][start_index]
                if debug:
                    print("size of neighbors: ", size)
                neighbors = links[currObj][(start_index + 1):(start_index + links_per_level)]
                for i in range(size):
                    cand = neighbors[i]
                    dist = calculateDist(q_data, data_l0[cand])
                    if debug:
                        print("cand: ", cand, ", dist: ", dist)
                    if dist < curdist:
                        curdist = dist
                        currObj = cand
                        changed = True
                        if debug:
                            print("changed")
                if debug:
                    print("one node finish")
                    print("")

        # search in ground layer
        if debug:
            print("")
            print("level: 0")
        visited_array = set() # default 0
        top_candidates = []
        candidate_set = []
        lowerBound = curdist
        # By default heap queue is a min heap: https://docs.python.org/3/library/heapq.html
        # candidate_set = candidate list, min heap
        # top_candidates = dynamic list (potential results), max heap (negated distance)
        # compare min(candidate_set) vs max(top_candidates)
        heapq.heappush(top_candidates, (-curdist, currObj))
        heapq.heappush(candidate_set, (curdist, currObj))
        visited_array.add(currObj)

        while len(candidate_set) != 0:
            current_node_pair = candidate_set[0]
            # Stop once the closest open candidate is worse than the worst kept result.
            if current_node_pair[0] > lowerBound:
                break
            heapq.heappop(candidate_set)
            current_node_id = current_node_pair[1]
            search_path_local_ID.add(current_node_id)
            size = links_count_l0[current_node_id]
            if debug:
                print("current object: ", current_node_id)
                print("size of neighbors: ", size)
            for i in range(size):
                candidate_id = links_l0[current_node_id][i]
                if candidate_id not in visited_array:
                    visited_array.add(candidate_id)
                    dist = calculateDist(q_data, data_l0[candidate_id])
                    if debug:
                        print("current object: ", candidate_id, ", current distance: ", dist, ", lowerBound: ", lowerBound)
                    if len(top_candidates) < ef or lowerBound > dist:
                        if debug:
                            print("added")
                        heapq.heappush(candidate_set, (dist, candidate_id))
                        heapq.heappush(top_candidates, (-dist, candidate_id))
                    if len(top_candidates) > ef:
                        heapq.heappop(top_candidates)
                    if len(top_candidates) != 0:
                        lowerBound = -top_candidates[0][0]
                else:
                    if debug:
                        print("current object: ", candidate_id, ", visited already")
            if debug:
                print("one node finishes")
                print("")

        # Drop the farthest entries until only k remain.
        while len(top_candidates) > k:
            heapq.heappop(top_candidates)

        result = []
        while len(top_candidates) > 0:
            candidate_pair = top_candidates[0]
            # Wenqi: here, replace the local candidate ID by real node ID, great!
            result.append([-candidate_pair[0], self.local_server_ID, label_l0[candidate_pair[1]]])
            heapq.heappop(top_candidates)
        # The max heap pops farthest-first; flip to nearest-first.
        result.reverse()

        for local_ID in search_path_local_ID:
            search_path_vec_ID.add(label_l0[local_ID])

        return result, search_path_local_ID, search_path_vec_ID
        
    def searchKnnPlusRemoteCache(self, q_data, k, ef, all_vectors, debug=False):
        """
        Search local vectors + cached remote vectors
        Input:
            all vectors = the entire dataset with N_TOTAL d-dimensional vectors, used to do remote search
        Output (in this order):
            results: merged local + remote results, in ascending distance
            local_results: local results, in ascending distance
            remote_results: remote results (only with the vectors one hop
                away from the local search path), in ascending distance
            search_remote: whether one should search remote (True/False),
                i.e. the best cached remote vector beats the best local one
            remote_server_ID: server of the best remote vector when
                search_remote is True, otherwise -1

        Requires insertRemote to have been called.
        """
        local_results, search_path_local_ID, search_path_vec_ID = self.searchKnn(q_data, k, ef, debug=debug)

        # Gather the remote vectors linked from the search path. Several
        # local nodes may link to the same remote vector; keeping each
        # (server_ID, vec_ID) once stops duplicates from filling the top-k.
        remote_candidates = {}
        for local_ID in search_path_local_ID:
            link_count = self.remote_links_count[local_ID]
            for link in self.remote_links[local_ID][:link_count]:
                remote_candidates[tuple(link)] = None

        # Negated distances so nlargest returns the k nearest, nearest first.
        negated = [
            (-np.sum((q_data - all_vectors[vec_ID]) ** 2), remote_server_ID, vec_ID)
            for remote_server_ID, vec_ID in remote_candidates
        ]
        remote_results = [
            (-neg_dist, remote_server_ID, vec_ID)
            for neg_dist, remote_server_ID, vec_ID in heapq.nlargest(k, negated)
        ]

        # Merge local + remote
        results = merge_two_distance_list(local_results, remote_results, k)

        # Guard the empty cases: with no remote candidates there is nothing
        # to hop to; with no local result any remote candidate wins.
        if remote_results and (not local_results or remote_results[0][0] < local_results[0][0]):
            search_remote = True
            remote_server_ID = remote_results[0][1]
        else:
            search_remote = False
            remote_server_ID = -1

        return results, local_results, remote_results, search_remote, remote_server_ID
        

        
    def searchKnnPlusRemote(self, q_data, k, ef, all_vectors, remote_hnswlib_indexes, remote_server_IDs, debug=False):
        """
        Search local vectors, hop to remote index ***(currently only support 1 index)*** when needed
            *** Thus, this is only a testing function, in reality, there should be a global search
                function allowing multiple hops between servers ***
        Input:
            all vectors = the entire dataset with N_TOTAL d-dimensional vectors, used to do remote search
            remote_hnswlib_indexes: a list of remote_hnswlib_index
                remote_hnswlib_index: index loaded by remote memory (hnswlib object)
            remote_server_IDs: a list of remote index IDs respective to remote_hnswlib_indexes
                e.g., this is server 1, and there are four servers in total,
                    then the remote index IDs should be [0, 2, 3]
        Output:
            results: merged results in ascending distance; local + full
                remote search when a hop was taken, otherwise local + cached
                remote vectors
            search_remote: whether the remote index was searched (True/False)
        Raises:
            ValueError if the server chosen for the hop is not listed in
                remote_server_IDs
        """
        local_plus_cach_results, local_results, remote_results, search_remote, remote_server_ID = \
            self.searchKnnPlusRemoteCache(q_data, k, ef, all_vectors, debug=debug)

        if not search_remote:
            return local_plus_cach_results, search_remote

        # Find the index object of the chosen server (last match wins).
        remote_hnswlib_index = None
        for candidate_ID, candidate_index in zip(remote_server_IDs, remote_hnswlib_indexes):
            if candidate_ID == remote_server_ID:
                remote_hnswlib_index = candidate_index
        if remote_hnswlib_index is None:
            # Previously this fell through to an UnboundLocalError.
            raise ValueError(
                "remote server ID {} is not in remote_server_IDs".format(remote_server_ID))

        remote_hnswlib_index.set_ef(ef)
        remote_I, remote_D = remote_hnswlib_index.knn_query(q_data, k=k) # I, D are 2-d array

        # merge results of the local search with the full remote search
        remote_results = [(remote_D[0][i], remote_server_ID, remote_I[0][i]) for i in range(remote_I.shape[1])]
        results = merge_two_distance_list(local_results, remote_results, k)

        return results, search_remote
        

Links format 

bytestrings: 0 0 0 ... 

0 -> no upper layer
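N -> the link list that follows occupies N bytes, where N = levels x 4 x (1 + maxM_); here maxM_ = 16, so each upper level takes 4 x 17 = 68 bytes (1 level -> 68, 2 levels -> 136)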

0 0 0 68 {string contents} 0 0 136 {string contents}

string contents: first element: valid edge number; rest: vector IDs

e.g.,
[
// first 1 + 16 elements
  10,
  
  1561,
  3999,
  4373,
  4213,
  178,
  6898,
  7020,
  7380,
  8454,
  8779,
  3498,
  3755,
  3999,
  4213,
  4373,
  4374,
  
// second 1 + 16 elements
  15,
  
  1191,
  1298,
  1311,
  1781,
  1930,
  2086,
  2598,
  2925,
  2936,
  3262,
  4374,
  4390,
  5441,
  5546,
  5607,
  0],

## Example 1: 2 sub-graphs

In [None]:
from pathlib import Path

# Read the whole serialized sub-graph 0 index into memory as bytes.
index_path = '../indexes_subgraph/SIFT1M_index_subgraph_0.bin'
index = Path(index_path).read_bytes()
len(index)

In [None]:
# Empty local index for server 0; its fields are filled by the load_* calls below.
hnsw_index = HNSW_index(local_server_ID=0, dim=128)

The parameters in the index header, stored in little-endian byte order:
uint64_t offsetLevel0_; // 0:8
uint64_t max_elements_; // 8:16
uint64_t cur_element_count; // 16:24
uint64_t size_data_per_element_; // 24:32
uint64_t label_offset_; // 32:40
uint64_t offsetData_; // 40:48
uint32_t maxlevel_; // 48:52
uint32_t enterpoint_node_; // 52:56
uint64_t maxM_; // 56:64
uint64_t maxM0_; // 64:72
uint64_t M_; // 72:80
double mult_; // 80:88
uint64_t ef_construction_; // 88:96

Results I got from C++ on SIFT1M:

Index file size: 660564936
offsetLevel0_: 0
max_elements_: 1000000
cur_element_count: 1000000
size_data_per_element_: 652
label_offset_: 644
offsetData_: 132
maxlevel_: 5
enterpoint_node_: 572337
maxM_: 16
maxM0_: 32
M_: 16
mult_: 0.360674
ef_construction_: 128

size_data_per_element_ 652 = link_count (4) + links (maxM0_ * 4 = 32 * 4) + vector (dim * 4 = 128 * 4) + vec_ID label (8)

In [None]:
# Parse the index header into hnsw_index and print every field.
hnsw_index.load_meta_info(index)

In [None]:
# Time the decoding of the ground layer (links, vectors, labels).
t0 = time.time()
hnsw_index.load_ground_layer(index)
t1 = time.time()
print("time consumption: {:.2f} sec".format(t1 - t0))

In [None]:
# Time the decoding of the upper-layer link lists.
t0 = time.time()
hnsw_index.load_upper_layers(index)
t1 = time.time()
print("time consumption: {:.2f} sec".format(t1 - t0))

In [None]:
# Time a single local top-1 search for the first query vector.
t0 = time.time()
result, search_path_local_ID, search_path_vec_ID = hnsw_index.searchKnn(xq[0], k=1, ef=128)
t1 = time.time()
print("time consumption: {:.2f} sec".format(t1 - t0))

In [None]:
# Local search result next to the ground-truth nearest neighbor of query 0.
print(result, gt[0, 0], sep="\n")

In [None]:
# Number of local nodes expanded during the search, followed by their local IDs.
print(len(search_path_local_ID), search_path_local_ID) 

In [None]:
# The same search path expressed as vector IDs (labels).
print(search_path_vec_ID)

In [None]:
# sorted_search_path_vec_ID = sorted(list(search_path_vec_ID)) 
# sorted_search_path_local_ID = sorted(list(search_path_local_ID))

# for i in range(len(sorted_search_path_local_ID)):
#     if sorted_search_path_local_ID[i] != sorted_search_path_vec_ID[i]:
#         print("i = {}\tsorted_search_path_local_ID = {}\tsorted_search_path_vec_ID = {}".format(
#             i, sorted_search_path_local_ID[i], sorted_search_path_vec_ID[i]))

In [None]:
# Load sub-graph 1 through hnswlib; it plays the role of the remote server.
# (The former `.format(dbname)` was a no-op: the path has no placeholder.)
remote_index_path = '../indexes_subgraph/SIFT1M_index_subgraph_1.bin'
remote_hnswlib_index = hnswlib.Index(space='l2', dim=dim)  # the space can be changed - keeps the data, alters the distance function.
# Report the path that is actually loaded (this used to print the local index_path).
print("\nLoading index from {}\n".format(remote_index_path))
remote_hnswlib_index.load_index(remote_index_path)

In [None]:
# One remote server (ID 1) backed by the hnswlib index loaded above.
remote_hnswlib_indexes = [remote_hnswlib_index]
remote_server_IDs = [1]

# Time the construction of the remote links for every local vector.
t0 = time.time()
hnsw_index.insertRemote(remote_hnswlib_indexes, remote_server_IDs, ef=128)
t1 = time.time()
print("time consumption: {:.2f} sec".format(t1 - t0))

In [None]:
# Search local + remote cache
# Top-1 search over local vectors + cached remote vectors, with tracing on.
results, local_results, remote_results, search_remote, remote_server_ID = \
    hnsw_index.searchKnnPlusRemoteCache(xq[0], k=1, ef=128, all_vectors=xb, debug=True)

In [None]:
# True when the best cached remote vector is closer than the best local one.
print(search_remote)

In [None]:
# Merged, local-only and remote-only results, then the ground-truth ID of query 0.
for shown in (results, local_results, remote_results, gt[0][0]):
    print(shown)

In [None]:
# Run the local + remote-cache search for every query and keep, per query,
# the last merged result and whether a remote hop would be taken.
result_list = []
search_remote_list = []
query_num = 10000

for i in range(query_num):
    results, local_results, remote_results, search_remote, remote_server_ID = \
        hnsw_index.searchKnnPlusRemoteCache(xq[i], k=1, ef=128, all_vectors=xb, debug=False)
    result_list.append(results[-1])
    search_remote_list.append(search_remote)

In [None]:
## Get recall for consider up to 1 remote hop
## Wenqi comment: even just 1 hop can reach 97% recall!!!
# First 100 queries -> 0.97 recall
# First 1000 queries -> 0.990 recall
# First 10000 queries -> 0.9949 recall (visit remote graph = 0.9978)

# Recall@1: fraction of queries whose returned vec_ID equals the ground truth.
count = sum(1 for i in range(query_num) if result_list[i][2] == gt[i][0])
print(count/query_num, count)

In [None]:
# Count how many searches travel to remote node

# Number (and fraction) of queries flagged for a remote hop.
search_remote_count = sum(1 for i in range(query_num) if search_remote_list[i])
print(search_remote_count/query_num, search_remote_count)

In [None]:
# Search local + remote when needed
# Top-10 search that hops to the remote index when the cache suggests it.
remote_hnswlib_indexes = [remote_hnswlib_index]
remote_server_IDs = [1]

hnsw_index.searchKnnPlusRemote(
    xq[0], k=10, ef=128, all_vectors=xb,
    remote_hnswlib_indexes=remote_hnswlib_indexes, remote_server_IDs=remote_server_IDs, debug=False)

In [None]:
# Run the local + remote-hop search for the first query_num queries.
result_list = []
search_remote_list = []
query_num = 100

for i in range(query_num):
    results, search_remote = hnsw_index.searchKnnPlusRemote(
        xq[i], k=1, ef=128, all_vectors=xb,
        remote_hnswlib_indexes=remote_hnswlib_indexes, remote_server_IDs=remote_server_IDs, debug=False)
    result_list.append(results[-1])
    search_remote_list.append(search_remote)

In [None]:
## Get recall for local + remote
## for first 100 queries; 1 hop can reach 97%; lazy evaluation search all = 98% recall (maybe good partitioning can solve it?)
# First 1000 queries -> 0.993 recall
# First 10000 queries -> 0.9978 -> Very high, should be high enough, so lazy evaluation should work well!

# Recall@1: fraction of queries whose returned vec_ID equals the ground truth.
count = sum(1 for i in range(query_num) if result_list[i][2] == gt[i][0])
print(count/query_num, count)

In [None]:
# Count how many searches travel to remote node
# for 10000 queris, 50.19% of search travels to the other server

# Number (and fraction) of queries that travelled to the remote index.
search_remote_count = sum(1 for i in range(query_num) if search_remote_list[i])
print(search_remote_count/query_num, search_remote_count)

## Example 2: 4 sub-graphs

In [6]:
from pathlib import Path

# Read the whole serialized index file for sub-graph 0 into memory as bytes
# and report its size.
index_path = '../indexes_subgraph_kmeans/SIFT1M_4_subgraphs/subgraph_0.bin'
index = Path(index_path).read_bytes()
print('size: ', len(index))

size:  237123624


In [9]:
# Build the local index object for server ID 0 (dim=128), then load its meta
# info from the raw index bytes read above.
hnsw_index = HNSW_index(local_server_ID=0, dim=128)
hnsw_index.load_meta_info(index)

offsetLevel0_ 0
max_elements_ 358987
cur_element_count 358987
size_data_per_element_ 652
label_offset_ 644
offsetData_ 132
maxlevel_ 4
enterpoint_node_ 81740
maxM_ 16
maxM0_ 32
M_ 16
mult_ 0.36067376022224085
ef_construction_ 128


In [10]:
# Wall-clock timing of the load_ground_layer call.
start = time.time()
HNSW_index.load_ground_layer(hnsw_index, index)
elapsed = time.time() - start
print("time consumption: {:.2f} sec".format(elapsed))

time consumption: 1.76 sec


In [11]:
# Wall-clock timing of the load_upper_layers call.
start = time.time()
HNSW_index.load_upper_layers(hnsw_index, index)
elapsed = time.time() - start
print("time consumption: {:.2f} sec".format(elapsed))

time consumption: 0.58 sec


In [12]:
# Search local: time a single top-1 query against the local index only.
start = time.time()
result, search_path_local_ID, search_path_vec_ID = HNSW_index.searchKnn(
    hnsw_index, xq[0], k=1, ef=128
)
elapsed = time.time() - start
print("time consumption: {:.2f} sec".format(elapsed))

time consumption: 0.05 sec


In [13]:
# Show the local search result next to the first ground-truth ID of query 0.
print(result, gt[0,0], sep='\n')

[[61125.0, 0, 504814]]
504814


In [20]:
# Load one hnswlib index per remote server (servers 1, 2 and 3); the i-th index
# in `remote_hnswlib_indexes` corresponds to the i-th ID in `remote_server_IDs`.
remote_server_IDs = [1, 2, 3]
remote_index_paths = [
    '../indexes_subgraph_kmeans/SIFT1M_4_subgraphs/subgraph_{}.bin'.format(server_id)
    for server_id in remote_server_IDs
]
remote_hnswlib_indexes = [hnswlib.Index(space='l2', dim=dim) for _ in remote_server_IDs]

# Iterate over (index, path) pairs directly. Naming a loop variable `id` here
# would shadow the builtin `id` for the rest of the kernel session.
for remote_index, remote_path in zip(remote_hnswlib_indexes, remote_index_paths):
    print("\nLoading index from {}\n".format(remote_path))
    remote_index.load_index(remote_path)


Loading index from ../indexes_subgraph_kmeans/SIFT1M_4_subgraphs/subgraph_1.bin


Loading index from ../indexes_subgraph_kmeans/SIFT1M_4_subgraphs/subgraph_2.bin


Loading index from ../indexes_subgraph_kmeans/SIFT1M_4_subgraphs/subgraph_3.bin



In [23]:
# Wall-clock timing of the insertRemote call over all remote indexes.
start = time.time()
HNSW_index.insertRemote(hnsw_index, remote_hnswlib_indexes, remote_server_IDs, ef=128)
elapsed = time.time() - start
print("time consumption: {:.2f} sec".format(elapsed))

time consumption: 61.47 sec


In [24]:
# Search local + remote cache (debug=True prints the traversal trace).
results, local_results, remote_results, search_remote, remote_server_ID = (
    HNSW_index.searchKnnPlusRemoteCache(
        hnsw_index,
        xq[0],
        k=1,
        ef=128,
        all_vectors=xb,
        debug=True,
    )
)


level:  4
current object:  81740 , current distance:  165691.0
size of neighbors:  1
cand:  174389 , dist:  128028.0
changed
one node finish

current object:  174389 , current distance:  128028.0
size of neighbors:  1
cand:  81740 , dist:  165691.0
one node finish


level:  3
current object:  174389 , current distance:  128028.0
size of neighbors:  13
cand:  35129 , dist:  257286.0
cand:  49352 , dist:  146393.0
cand:  81740 , dist:  165691.0
cand:  38272 , dist:  183160.0
cand:  168648 , dist:  192195.0
cand:  116957 , dist:  208710.0
cand:  28596 , dist:  181070.0
cand:  102760 , dist:  143478.0
cand:  210280 , dist:  129749.0
cand:  214868 , dist:  101316.0
changed
cand:  259501 , dist:  143103.0
cand:  276429 , dist:  180164.0
cand:  331205 , dist:  192026.0
one node finish

current object:  214868 , current distance:  101316.0
size of neighbors:  6
cand:  67199 , dist:  175542.0
cand:  95630 , dist:  182769.0
cand:  116957 , dist:  208710.0
cand:  174389 , dist:  128028.0
cand:  

added
current object:  35083 , current distance:  104574.0 , lowerBound:  124508.0
added
current object:  69362 , current distance:  138720.0 , lowerBound:  124494.0
current object:  104954 , current distance:  159293.0 , lowerBound:  124494.0
current object:  40833 , current distance:  105833.0 , lowerBound:  124494.0
added
current object:  106425 , visited already
current object:  111930 , current distance:  105948.0 , lowerBound:  124288.0
added
current object:  12072 , current distance:  117202.0 , lowerBound:  124233.0
added
current object:  82059 , visited already
current object:  106311 , current distance:  93644.0 , lowerBound:  124225.0
added
current object:  85623 , current distance:  107371.0 , lowerBound:  124064.0
added
current object:  128139 , current distance:  139241.0 , lowerBound:  123640.0
current object:  129324 , current distance:  135408.0 , lowerBound:  123640.0
current object:  143890 , current distance:  103654.0 , lowerBound:  123640.0
added
current object:  

current object:  210570 , current distance:  130175.0 , lowerBound:  110008.0
current object:  210458 , visited already
current object:  210613 , visited already
current object:  210635 , current distance:  104529.0 , lowerBound:  110008.0
added
current object:  210762 , current distance:  114159.0 , lowerBound:  109876.0
current object:  210805 , current distance:  91040.0 , lowerBound:  109876.0
added
current object:  236637 , current distance:  157963.0 , lowerBound:  109761.0
one node finishes

current object:  210805
size of neighbors:  16
current object:  181758 , current distance:  138296.0 , lowerBound:  109761.0
current object:  106182 , current distance:  173875.0 , lowerBound:  109761.0
current object:  133332 , visited already
current object:  14989 , current distance:  130145.0 , lowerBound:  109761.0
current object:  106374 , current distance:  162269.0 , lowerBound:  109761.0
current object:  14696 , current distance:  116034.0 , lowerBound:  109761.0
current object:  80

current object:  119726 , current distance:  147728.0 , lowerBound:  106918.0
current object:  30735 , current distance:  164020.0 , lowerBound:  106918.0
current object:  82078 , current distance:  137016.0 , lowerBound:  106918.0
current object:  131827 , current distance:  106116.0 , lowerBound:  106918.0
added
current object:  336936 , current distance:  142474.0 , lowerBound:  106852.0
current object:  196289 , current distance:  146519.0 , lowerBound:  106852.0
current object:  269123 , current distance:  129057.0 , lowerBound:  106852.0
current object:  83941 , visited already
current object:  154713 , current distance:  133425.0 , lowerBound:  106852.0
current object:  188037 , visited already
current object:  51110 , current distance:  141864.0 , lowerBound:  106852.0
one node finishes

current object:  133762
size of neighbors:  28
current object:  2844 , current distance:  162048.0 , lowerBound:  106852.0
current object:  39406 , current distance:  132100.0 , lowerBound:  10

current object:  35604
size of neighbors:  32
current object:  291543 , current distance:  110551.0 , lowerBound:  105687.0
current object:  81807 , current distance:  168427.0 , lowerBound:  105687.0
current object:  123575 , current distance:  115008.0 , lowerBound:  105687.0
current object:  44969 , current distance:  136449.0 , lowerBound:  105687.0
current object:  27501 , visited already
current object:  152009 , current distance:  129783.0 , lowerBound:  105687.0
current object:  28222 , current distance:  146955.0 , lowerBound:  105687.0
current object:  210671 , visited already
current object:  33853 , current distance:  137033.0 , lowerBound:  105687.0
current object:  9017 , current distance:  156845.0 , lowerBound:  105687.0
current object:  73366 , current distance:  141408.0 , lowerBound:  105687.0
current object:  133193 , current distance:  164360.0 , lowerBound:  105687.0
current object:  284696 , current distance:  133587.0 , lowerBound:  105687.0
current object:  283

current object:  331066 , current distance:  169299.0 , lowerBound:  104684.0
current object:  138513 , visited already
current object:  341101 , current distance:  131402.0 , lowerBound:  104684.0
one node finishes

current object:  210671
size of neighbors:  19
current object:  123856 , visited already
current object:  76897 , current distance:  144892.0 , lowerBound:  104684.0
current object:  93875 , current distance:  139440.0 , lowerBound:  104684.0
current object:  37758 , current distance:  121464.0 , lowerBound:  104684.0
current object:  35604 , visited already
current object:  182536 , visited already
current object:  152439 , visited already
current object:  40833 , visited already
current object:  205113 , visited already
current object:  51187 , visited already
current object:  210559 , current distance:  122256.0 , lowerBound:  104684.0
current object:  210613 , visited already
current object:  210509 , visited already
current object:  210635 , visited already
current ob

current object:  338679 , current distance:  150762.0 , lowerBound:  103912.0
current object:  355909 , current distance:  158535.0 , lowerBound:  103912.0
one node finishes

current object:  217228
size of neighbors:  23
current object:  59368 , current distance:  143894.0 , lowerBound:  103912.0
current object:  157788 , current distance:  155802.0 , lowerBound:  103912.0
current object:  207240 , current distance:  132988.0 , lowerBound:  103912.0
current object:  121707 , visited already
current object:  95856 , current distance:  102842.0 , lowerBound:  103912.0
added
current object:  133819 , current distance:  115087.0 , lowerBound:  103871.0
current object:  33494 , current distance:  128687.0 , lowerBound:  103871.0
current object:  46839 , current distance:  125034.0 , lowerBound:  103871.0
current object:  201101 , current distance:  130532.0 , lowerBound:  103871.0
current object:  72546 , current distance:  127234.0 , lowerBound:  103871.0
current object:  166184 , current

current object:  317864 , visited already
current object:  284870 , current distance:  138952.0 , lowerBound:  102982.0
current object:  224640 , current distance:  126463.0 , lowerBound:  102982.0
current object:  125273 , visited already
current object:  105067 , current distance:  141485.0 , lowerBound:  102982.0
current object:  290977 , current distance:  134047.0 , lowerBound:  102982.0
current object:  45066 , current distance:  141502.0 , lowerBound:  102982.0
current object:  71897 , visited already
current object:  53643 , current distance:  133947.0 , lowerBound:  102982.0
current object:  210045 , current distance:  129935.0 , lowerBound:  102982.0
current object:  175740 , current distance:  131942.0 , lowerBound:  102982.0
current object:  105003 , current distance:  139143.0 , lowerBound:  102982.0
current object:  319538 , current distance:  155615.0 , lowerBound:  102982.0
current object:  87983 , current distance:  130137.0 , lowerBound:  102982.0
current object:  233

In [25]:
# Print the remote-search flag, the three result lists and the first
# ground-truth ID of query 0, one per line.
print(search_remote, results, local_results, remote_results, gt[0][0], sep='\n')

False
[(61125.0, 0, 504814)]
[[61125.0, 0, 504814]]
[(112004.0, 1, 225145)]
504814


In [26]:
# Run the first `query_num` queries through searchKnnPlusRemoteCache, keeping the
# last entry of each returned result list together with the remote-search flag.
query_num = 100
result_list, search_remote_list = [], []

for i in range(query_num):
    (
        results,
        local_results,
        remote_results,
        search_remote,
        remote_server_ID,
    ) = HNSW_index.searchKnnPlusRemoteCache(
        hnsw_index, xq[i], k=1, ef=128, all_vectors=xb, debug=False
    )
    result_list.append(results[-1])
    search_remote_list.append(search_remote)

In [27]:
## Recall when considering up to 1 remote hop
## Wenqi comment: even just 1 hop can reach xx% recall!!!
# NOTE(review): the figures below are unfilled placeholders, and the trailing 0.9978
# looks copied from an earlier cell -- TODO confirm and fill in.
# First 100 queries -> 0.xx recall
# First 1000 queries -> 0.xxx recall
# First 10000 queries -> 0.xx recall (visit remote graph = 0.9978)

# A query counts as a hit when index 2 of its stored result equals the first
# ground-truth ID for that query.
count = sum(1 for i in range(query_num) if result_list[i][2] == gt[i][0])
print(count/query_num, count)

0.55 55


In [28]:
# Display the collected per-query results; index 2 of each tuple is the ID that the
# recall cell compares against the ground truth.
result_list

[(61125.0, 0, 504814),
 (90500.0, 1, 481578),
 (49682.0, 0, 552515),
 (67326.0, 2, 341623),
 (86317.0, 1, 536507),
 (36847.0, 2, 188730),
 (19658.0, 2, 572053),
 (77786.0, 0, 327960),
 (61815.0, 3, 452279),
 (44113.0, 2, 592948),
 (123587.0, 1, 682344),
 (53513.0, 3, 771023),
 (90584.0, 3, 826761),
 (63552.0, 1, 717949),
 (74573.0, 2, 468791),
 (89593.0, 0, 776345),
 (76238.0, 3, 560049),
 (68137.0, 1, 368300),
 (44529.0, 1, 495223),
 (69042.0, 3, 172071),
 (67778.0, 0, 68023),
 (54462.0, 0, 671173),
 (71914.0, 1, 8813),
 (101565.0, 1, 609333),
 (81364.0, 1, 658686),
 (53304.0, 2, 838692),
 (60544.0, 1, 397855),
 (47047.0, 0, 221028),
 (59493.0, 0, 962851),
 (75169.0, 1, 191053),
 (58396.0, 0, 425493),
 (64474.0, 2, 181775),
 (74212.0, 2, 588756),
 (82034.0, 1, 226194),
 (79119.0, 3, 144178),
 (80708.0, 2, 860667),
 (63324.0, 0, 602485),
 (47917.0, 0, 812777),
 (43526.0, 0, 341091),
 (48413.0, 0, 856200),
 (57088.0, 2, 409148),
 (17417.0, 0, 499763),
 (55637.0, 1, 550479),
 (39624.0, 0

In [None]:
# Count how many searches travel to a remote node
# (one per truthy flag among the first `query_num` entries).
search_remote_count = sum(1 for i in range(query_num) if search_remote_list[i])
print(search_remote_count/query_num, search_remote_count)

In [29]:
# Search local + remote when needed.
# Left as the cell's last expression so the returned value is displayed.
HNSW_index.searchKnnPlusRemote(
    hnsw_index,
    xq[0],
    k=10,
    ef=128,
    all_vectors=xb,
    remote_hnswlib_indexes=remote_hnswlib_indexes,
    remote_server_IDs=remote_server_IDs,
    debug=False,
)

([(61125.0, 0, 504814),
  (62943.0, 0, 344333),
  (73660.0, 0, 900184),
  (77284.0, 0, 657385),
  (77515.0, 0, 205768),
  (80082.0, 0, 278818),
  (82498.0, 0, 831624),
  (83564.0, 0, 885651),
  (84903.0, 0, 504835),
  (85222.0, 0, 504547)],
 False)

In [30]:
# Run the first `query_num` queries through searchKnnPlusRemote, keeping the last
# entry of each returned result list together with the remote-search flag.
query_num = 100
result_list, search_remote_list = [], []

# Keyword arguments shared by every query in the batch.
search_options = dict(
    k=1,
    ef=128,
    all_vectors=xb,
    remote_hnswlib_indexes=remote_hnswlib_indexes,
    remote_server_IDs=remote_server_IDs,
    debug=False,
)
for i in range(query_num):
    results, search_remote = HNSW_index.searchKnnPlusRemote(hnsw_index, xq[i], **search_options)
    result_list.append(results[-1])
    search_remote_list.append(search_remote)

In [31]:
## Recall when considering up to 1 remote hop
## Wenqi comment: for the k-means-based method, if you start from a bad partition,
#    you may not end up getting the true nearest neighbor!
# First 100 queries -> 0.9 recall
# First 1000 queries -> 0.xxx recall
# First 10000 queries -> 0.xx recall (visit remote graph = 0.9978)

# A query counts as a hit when index 2 of its stored result equals the first
# ground-truth ID for that query.
count = sum(1 for i in range(query_num) if result_list[i][2] == gt[i][0])
print(count/query_num, count)

0.9 90


In [32]:
# Count how many searches travel to a remote node
# (one per truthy flag among the first `query_num` entries).
search_remote_count = sum(1 for i in range(query_num) if search_remote_list[i])
print(search_remote_count/query_num, search_remote_count)

0.62 62
