In [1]:
import hnswlib
import numpy as np 
import struct
import heapq
import time
import pickle
import os

from pathlib import Path

In [2]:
def recall_eval(result_list, gt, k, query_num):
    """
    Input:
        result list: a 2-dim list
            dim 1: query num
            dim 2: topK
        gt: a ground truth 2-d numpy array
            dim 1: query num
            dim 2: topK, 1000 for sift dataset
        k: topK to be used for recall evaluation,
            *** can be anything smaller than the dim2 of result_list ***)
    Output:
        recall
    """

    count = 0
    for i in range(query_num):
        gt_set = set()
        for j in range(k):
            gt_set.add(gt[i][j])
        for j in range(k):
            vec_ID = result_list[i][j]
            if vec_ID in gt_set:
                count += 1
    recall = count / (query_num * k)
    return recall

In [3]:
def save_obj(obj, dirc, name):
    # note use "dir/" in dirc
    with open(os.path.join(dirc, name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, protocol=4) # for py37,pickle.HIGHEST_PROTOCOL=4

def load_obj(dirc, name):
    with open(os.path.join(dirc, name + '.pkl'), 'rb') as f:
        return pickle.load(f)
    
def convertBytes(bytestring, dtype='int'):
    """
    convert bytes to a single element
    dtype = {int, long, float, double}
    struct: https://docs.python.org/3/library/struct.html
    """ 
    # int from bytes is much faster than struct.unpack
    if dtype =='int' or dtype == 'long': 
        return int.from_bytes(bytestring, byteorder='little', signed=False)
    elif dtype == 'float': 
        return struct.unpack('f', bytestring)[0]
    elif dtype == 'double': 
        return struct.unpack('d', bytestring)[0]
    else:
        raise ValueError 

# Wenqi: the fastest way to load a bytestring list is to use *** np.frombuffer ***
def convertBytesList(bytestring, dtype='int'):
    """
    Given a byte string, return the value list
    """
    result_list = []
    if dtype == 'int' or dtype == 'float':
        dsize = 4
    elif dtype == 'long' or dtype == 'double':
        dsize = 8
    else:
        raise ValueError 
        
    start_pointer = 0
    for i in range(len(bytestring) // dsize):
        result_list.append(convertBytes(
            bytestring[start_pointer: start_pointer + dsize], dtype=dtype))
        start_pointer += dsize
    return result_list

def calculateDist(query_data, db_vec):
    """
    HNSWLib returns L2 square distance, so do we
        both inputs are 1-d np array
    """
    # return l2 distance between two points
    return np.sum((query_data - db_vec) ** 2)


def merge_two_distance_list(list_A, list_B, k):
    """
    merge two lists by selecting the k pairs of the smallest distance
    input:
        both list has format [(dist, ID), (dist, ID), ...]
    return:
        a result list, with ascending distance (the first contains the largest distance)
    """
    
    results_heap = []
    for i in range(len(list_A)):
        dist, server_ID, vec_ID = list_A[i]
        heapq.heappush(results_heap, (-dist, server_ID, vec_ID))
    for i in range(len(list_B)):
        dist, server_ID, vec_ID = list_B[i]
        heapq.heappush(results_heap, (-dist, server_ID, vec_ID))

    while len(results_heap) > k:
        heapq.heappop(results_heap)

    results = []
    while len(results_heap) > 0:
        dist, server_ID, vec_ID = results_heap[0]
        results.append((-dist, server_ID, vec_ID))
        heapq.heappop(results_heap)
    results.reverse()
            
    return results

        
class HNSW_index():
    
    """
    Returned result list always in the format of (dist, server_ID, vec_ID),
        in ascending distance order (the first result is the nearest neighbor)
    """
    
    def __init__(self, local_server_ID=0, dim=128):
        
        self.dim = dim
        self.local_server_ID = local_server_ID
        
        # Meta Info
        self.offsetLevel0_ = None
        self.max_elements_ = None
        self.cur_element_count = None
        self.size_data_per_element_ = None
        self.label_offset_ = None
        self.offsetData_ = None
        self.maxlevel_ = None
        self.enterpoint_node_ = None
        self.maxM_ = None
        self.maxM0_ = None
        self.M_ = None
        self.mult_ = None # the probability that a node is one a higher level
        self.ef_construction_ = None
        
        # Graph partition info
        self.centroid_vectors = None # centroids for all sub-graphs
        
        # ground layer, all with length of cur_element_count
        self.links_count_l0 = None # a list of link_count
        self.links_l0 = None # a list of links per vector
        self.data_l0 = None # a list of vectors
        self.label_l0 = None # a list of vector IDs
        self.vec_ID_to_local_ID = dict() # mapping from vector ID -> local storage ID (on ground layer)
        
        # upper layers, all with length of cur_element_count
        self.element_levels_ = None # the level per vector
        self.links = None # the upper layer link info (link count + links)
        
        # remote nodes, order according to local ID (not label ID)
        #  remote_links: an 2-D array (cur_element_count, k), 
        #    each element is a tuple: (server_ID, vector_ID)
        self.remote_links_count = None
        self.remote_links = None
        
    def load_meta_info(self, index_bin):
        """
        index_bin = hnswlib index binary 
        
        HNSW save index order:
            https://github.com/WenqiJiang/hnswlib-eval/blob/master/hnswlib/hnswalg.h#L588-L616
        """
        self.offsetLevel0_ = int.from_bytes(index_bin[0:8], byteorder='little', signed=False)
        self.max_elements_ = int.from_bytes(index_bin[8:16], byteorder='little', signed=False)
        self.cur_element_count = int.from_bytes(index_bin[16:24], byteorder='little', signed=False)
        self.size_data_per_element_ = int.from_bytes(index_bin[24:32], byteorder='little', signed=False)
        self.label_offset_ = int.from_bytes(index_bin[32:40], byteorder='little', signed=False)
        self.offsetData_ = int.from_bytes(index_bin[40:48], byteorder='little', signed=False)
        self.maxlevel_ = int.from_bytes(index_bin[48:52], byteorder='little', signed=False)
        self.enterpoint_node_ = int.from_bytes(index_bin[52:56], byteorder='little', signed=False)
        self.maxM_ = int.from_bytes(index_bin[56:64], byteorder='little', signed=False)
        self.maxM0_ = int.from_bytes(index_bin[64:72], byteorder='little', signed=False)
        self.M_ = int.from_bytes(index_bin[72:80], byteorder='little', signed=False)
        self.mult_ = struct.unpack('d', index_bin[80:88])[0] # the probability that a node is one a higher level
        self.ef_construction_ = int.from_bytes(index_bin[88:96], byteorder='little', signed=False)
        

        print("offsetLevel0_", self.offsetLevel0_)
        print("max_elements_", self.max_elements_)
        print("cur_element_count", self.cur_element_count)
        print("size_data_per_element_", self.size_data_per_element_)
        print("label_offset_", self.label_offset_)
        print("offsetData_", self.offsetData_)
        print("maxlevel_", self.maxlevel_)
        print("enterpoint_node_", self.enterpoint_node_)
        print("maxM_", self.maxM_)
        print("maxM0_", self.maxM0_)
        print("M_", self.M_)
        print("mult_", self.mult_)
        print("ef_construction_", self.ef_construction_)
        
    def set_centroid_vectors(self, centroid_vectors):
        """
        Given n subgraphs, there will be n centroid vectors,
            the centroid vector of the current sub-graph is 
            self.centroid_vectors[self.local_server_ID]
        Input:
            cluster centroids: (n, dim)
        """
        assert centroid_vectors.shape[1] == self.dim
        self.centroid_vectors = centroid_vectors
        
    def load_ground_layer(self, index_bin):
        """
        Get the ground layer vector ID, vectors, and links:
            links_count_l0: vec_num
            links_l0: maxM0_ * vec_num 
            data_l0: (dim, vec_num)
            label_l0: vec_num
        """
        
        # Layer 0 data 
        start_byte_pointer = 96
        delta = self.cur_element_count * self.size_data_per_element_
        data_level0 = index_bin[start_byte_pointer: start_byte_pointer + delta]
        
        size = len(data_level0)
        self.links_count_l0 = []
        self.links_l0 = np.zeros((self.cur_element_count, self.maxM0_), dtype=int)
        self.data_l0 = np.zeros((self.cur_element_count, self.dim))
        self.label_l0 = []
        self.vec_ID_to_local_ID = dict()

        data_l0_list = []
        
        assert len(data_level0) == self.size_data_per_element_ * self.cur_element_count
        
        size_link_count = 4
        size_links = self.maxM0_ * 4
        size_vectors = self.dim * 4
        size_label = 8
        
        assert self.size_data_per_element_ == \
            size_link_count + size_links + size_vectors + size_label
            
        for i in range(self.cur_element_count):
            # per ground layer node: (link_count (int), links (int array of len=maxM0_), 
            #    vector (float array of len=dim, vector ID (long)))
            
            addr_link_count = i * self.size_data_per_element_ 
            addr_links = addr_link_count + size_link_count
            addr_vectors = addr_links + size_links
            addr_label = addr_vectors + size_vectors
            
            tmp_bytes = data_level0[addr_link_count: addr_link_count + size_link_count]
            self.links_count_l0.append(convertBytes(tmp_bytes, dtype='int'))
        
            tmp_bytes = data_level0[addr_links: addr_links + size_links]
            self.links_l0[i] = np.frombuffer(tmp_bytes, dtype=np.int32)
            
            tmp_bytes = data_level0[addr_vectors: addr_vectors + size_vectors]
            self.data_l0[i] = np.frombuffer(tmp_bytes, dtype=np.float32)
            
            tmp_bytes = data_level0[addr_label: addr_label + size_label]
            vec_ID = convertBytes(tmp_bytes, dtype='long')
            self.label_l0.append(vec_ID)
            self.vec_ID_to_local_ID[vec_ID] = i


    def load_upper_layers(self, index_bin):
        """
        Get the upper layer info:
            element_levels_: the levels of each vector
            links: list of upper links
        """
        
        # meta + ground data
        start_byte_pointer = 96 + self.max_elements_ * self.size_data_per_element_
        
        # Upper layers
        links_count = 0
        size_links_per_element_ = self.maxM_ * 4 + 4
        self.element_levels_ = []
        self.links = []

        for i in range(self.cur_element_count):
            tmp_bytes = index_bin[start_byte_pointer:start_byte_pointer+4]
            linkListSize = convertBytes(tmp_bytes, dtype='int')
            start_byte_pointer += 4
            
            # if an element is only on ground layer, it has no links on upper layers at all
            if linkListSize == 0:
                self.element_levels_.append(0)
                self.links.append([])
            else:
                level = int(linkListSize / size_links_per_element_)
                self.element_levels_.append(level)
                tmp_bytes = index_bin[start_byte_pointer:start_byte_pointer+linkListSize]
                links_tmp = list(np.frombuffer(tmp_bytes, dtype=np.int32))
                start_byte_pointer += linkListSize
                links_count += linkListSize / 4;
                self.links.append(links_tmp)

        assert start_byte_pointer == len(index_bin) # 6606296

    def insertRemote(self, remote_hnswlib_indexes, remote_server_IDs, ef=128):
        """
        Input: 
            remote_hnswlib_indexes: a list of remote_hnswlib_index
                remote_hnswlib_index: index loaded by remote memory (hnswlib object)
            remote_server_IDs: a list of remote index IDs respective to remote_hnswlib_indexes
                e.g., this is server 1, and there are four servers in totol,
                    then the remote index IDs should be [0, 2, 3]
        """
        self.remote_links_count = []
        self.remote_links = []
        
        remote_I_list = []
        remote_D_list = []
        remote_server_ID_list = []
        
        k = self.maxM0_
        
        assert len(remote_hnswlib_indexes) == len(remote_server_IDs)
        
        # query all servers
        for i in range(len(remote_server_IDs)):
            remote_hnswlib_index = remote_hnswlib_indexes[i]
            remote_server_ID = remote_server_IDs[i]
        
            query = self.data_l0
            remote_hnswlib_index.set_ef(ef)
            I, D = remote_hnswlib_index.knn_query(query, k=k)
            remote_I_list.append(I)
            remote_D_list.append(D)
            remote_server_ID_list.append(
                np.ones((I.shape[0], I.shape[1]), dtype=np.int32) * int(remote_server_ID))
    
        # merge results per server
        remote_I = np.concatenate(remote_I_list, axis=1)
        remote_D = np.concatenate(remote_D_list, axis=1)
        remote_server_ID = np.concatenate(remote_server_ID_list, axis=1)
        
        D_server_ID_I_list = [[] for i in range(remote_I.shape[0])]
        server_ID_I_list= [[] for i in range(remote_I.shape[0])]
        for i in range(remote_I.shape[0]):
            for j in range(remote_I.shape[1]):
                D_server_ID_I_list[i].append((remote_D[i][j], remote_server_ID[i][j], remote_I[i][j]))
            D_server_ID_I_list[i].sort()
            D_server_ID_I_list[i] = D_server_ID_I_list[i][:k]
            server_ID_I_list[i] = [(s, i) for d, s, i in D_server_ID_I_list[i]]
                
        
        self.remote_links_count = [k for i in range(self.cur_element_count)]
        #  remote_links: an 2-D array (cur_element_count x k), 
        #    each element is a tuple: (server_ID, vector_ID)
        self.remote_links = server_ID_I_list
        
    def searchKnnGroundLayer(self, q_data, k, ef, ep_local_id, existing_results=None):
        """
        The ground layer searching process.
        Input:
            query vector
            topk
            ef
            ep_local_id: entry point of the ground layer (local ID, not real vec ID)
            existing_results: a list of search results on previous servers
        Output:
            topK results: list of (dist, serverID, vecID) in ascending distance order
            search_path_local_ID_gnd: ground layer search path (local node ID) 
            search_path_vec_ID_gnd: ground layer search path (vector ID)
        """
        
        num_elements = self.cur_element_count
        links_count_l0 = self.links_count_l0
        links_l0 = self.links_l0
        data_l0 = self.data_l0
        label_l0 = self.label_l0
        dim = self.dim
        
        search_path_local_ID_gnd = set()
        search_path_vec_ID_gnd = set()
        
        # dynamic list (result candidates): (-dist, server_ID, vec_ID)
        top_candidates = [] 
        # candidate list: (dist, local_ID)
        candidate_set = []
        # local visisted vectors
        visited_array = set() 
        
        
        ep_dist = calculateDist(q_data, data_l0[ep_local_id])
        distUpperBound = ep_dist 
        # By default heap queue is a min heap: https://docs.python.org/3/library/heapq.html
        # candidate_set = candidate list, min heap
        # top_candidates = dynamic list (potential results), max heap
        # compare min(candidate_set) vs max(top_candidates)
        if not existing_results:
            vec_ID = label_l0[ep_local_id]
            heapq.heappush(top_candidates, (-ep_dist, self.local_server_ID, vec_ID))
            heapq.heappush(candidate_set, (ep_dist, ep_local_id))
            visited_array.add(ep_local_id) 
        else:
            for dist, server_ID, vec_ID in existing_results:
                heapq.heappush(top_candidates, (-dist, server_ID, vec_ID))
                if server_ID == self.local_server_ID: 
                    local_ID = self.vec_ID_to_local_ID[vec_ID]
                    heapq.heappush(candidate_set, (dist, local_ID))
                    visited_array.add(local_ID)
                
                
        while len(candidate_set)!=0:
            current_node_dist, current_node_id = candidate_set[0]
            if ((current_node_dist > distUpperBound)):
                break
            heapq.heappop(candidate_set)
            search_path_local_ID_gnd.add(current_node_id)
            size = links_count_l0[current_node_id]
            
            for i in range(size):
                candidate_id = links_l0[current_node_id][i]
                if (candidate_id not in visited_array):
                    visited_array.add(candidate_id)
                    currVec = data_l0[candidate_id]
                    dist = calculateDist(q_data, currVec)
                    
                    if (len(top_candidates) < ef or distUpperBound > dist):
                        heapq.heappush(candidate_set, (dist, candidate_id))
                        vec_ID = label_l0[candidate_id]
                        heapq.heappush(top_candidates, (-dist, self.local_server_ID, vec_ID))
                    if (len(top_candidates) > ef):
                        heapq.heappop(top_candidates)
                    if (len(top_candidates)!=0):
                        distUpperBound = -top_candidates[0][0] 
                     

        while len(top_candidates) > k:
            heapq.heappop(top_candidates)

        result = []
        while len(top_candidates) > 0:
            minus_dist, server_ID, vec_ID = top_candidates[0]
            result.append([-minus_dist, server_ID, vec_ID])
            heapq.heappop(top_candidates)
        result.reverse()
            
        for local_ID in search_path_local_ID_gnd:
            search_path_vec_ID_gnd.add(label_l0[local_ID])

        return result, search_path_local_ID_gnd, search_path_vec_ID_gnd
        
    def searchKnn(self, q_data, k, ef):
        """
        The HNSW way to search knn, from top layer all the way down to the bottom.
        result a list of (distance, vec_ID) in ascending distance
        """
        
        ep_node = self.enterpoint_node_
        max_level = self.maxlevel_
        links = self.links
        data_l0 = self.data_l0
        label_l0 = self.label_l0
        dim = self.dim
        
        currObj = ep_node
        currVec = data_l0[currObj]
        curdist = calculateDist(q_data, currVec)
        
        search_path_local_ID_upper = set()
        search_path_vec_ID_upper = set()
        
        # search upper layers
        for level in reversed(range(1, max_level+1)):
            
            changed = True
            while changed:
            
                search_path_local_ID_upper.add(currObj)
                changed = False
                if (len(links[currObj])==0):
                    break
                else:
                    start_index = (level-1) * 17
                    size = links[currObj][start_index]
                    neighbors = links[currObj][(start_index+1):(start_index+17)]
                    
                    for i in range(size):
                        cand = neighbors[i]
                        currVec = data_l0[cand]
                        dist = calculateDist(q_data, currVec)
                        if (dist < curdist):
                            curdist = dist
                            currObj = cand
                            changed = True
                            
        for local_ID in search_path_local_ID_upper:
            search_path_vec_ID_upper.add(label_l0[local_ID])
            
        # search in ground layer
        result, search_path_local_ID_gnd, search_path_vec_ID_gnd = \
            self.searchKnnGroundLayer(q_data, k, ef, ep_local_id=currObj)
        
        return result, search_path_local_ID_upper, search_path_vec_ID_upper, search_path_local_ID_gnd, search_path_vec_ID_gnd
        

In [4]:
index_path = '../indexes/SIFT100K_index.bin'
index = Path(index_path).read_bytes()
dim = 128
hnsw_index = HNSW_index(local_server_ID=0, dim=dim)
hnsw_index.load_meta_info(index)

print('load ground layer...')
t0 = time.time()
hnsw_index.load_ground_layer(index)
t1 = time.time()
print("time consumption: {:.2f} sec".format(t1 - t0))

print('load upper layer...')
t0 = time.time()
hnsw_index.load_upper_layers(index)
t1 = time.time()
print("time consumption: {:.2f} sec".format(t1 - t0))

offsetLevel0_ 0
max_elements_ 100000
cur_element_count 100000
size_data_per_element_ 652
label_offset_ 644
offsetData_ 132
maxlevel_ 4
enterpoint_node_ 81624
maxM_ 16
maxM0_ 32
M_ 16
mult_ 0.36067376022224085
ef_construction_ 128
load ground layer...
time consumption: 0.53 sec
load upper layer...
time consumption: 0.12 sec


In [5]:
ef = hnsw_index.ef_construction_
print(ef)

128


In [6]:
def mmap_fvecs(fname):
    x = np.memmap(fname, dtype='int32', mode='r')
    d = x[0]
    return x.view('float32').reshape(-1, d + 1)[:, 1:]

def mmap_bvecs(fname):
    x = np.memmap(fname, dtype='uint8', mode='r')
    d = x[:4].view('int32')[0]
    return x.reshape(-1, d + 4)[:, 4:]

def ivecs_read(fname):
    a = np.fromfile(fname, dtype='int32')
    d = a[0]
    # Wenqi: Format of ground truth (for 10000 query vectors):
    #   1000(topK), [1000 ids]
    #   1000(topK), [1000 ids]
    #        ...     ...
    #   1000(topK), [1000 ids]
    # 10000 rows in total, 10000 * 1001 elements, 10000 * 1001 * 4 bytes
    return a.reshape(-1, d + 1)[:, 1:].copy()

def fvecs_read(fname):
    return ivecs_read(fname).view('float32')

In [7]:
dbname = 'SIFT1M'
dim=128

if dbname.startswith('SIFT'):
    # SIFT1M to SIFT1000M
    dbsize = int(dbname[4:-1])
    xb = mmap_bvecs('/mnt/scratch/wenqi/Faiss_experiments/bigann/bigann_base.bvecs')
    xq = mmap_bvecs('/mnt/scratch/wenqi/Faiss_experiments/bigann/bigann_query.bvecs')
    gt = ivecs_read('/mnt/scratch/wenqi/Faiss_experiments/bigann/gnd/idx_%dM.ivecs' % dbsize)

    N_VEC = int(dbsize * 1000 * 1000)

    # trim xb to correct size
    xb = xb[:dbsize * 1000 * 1000]
    xb = xb.astype('float32').copy()

    # Wenqi: load xq to main memory and reshape
    xq = xq.astype('float32').copy()
    xq = np.array(xq, dtype=np.float32)
    gt = np.array(gt, dtype=np.int32)

    print("Vector shapes:")
    print("Base vector xb: ", xb.shape)
    print("Query vector xq: ", xq.shape)
    print("Ground truth gt: ", gt.shape)
else:
    print('unknown dataset', dbname, file=sys.stderr)
    sys.exit(1)

Vector shapes:
Base vector xb:  (1000000, 128)
Query vector xq:  (10000, 128)
Ground truth gt:  (10000, 1000)


In [8]:
index_path='../indexes/{}_index.bin'.format('SIFT100K')
p = hnswlib.Index(space='l2', dim=dim)  # the space can be changed - keeps the data, alters the distance function.
print("\nLoading index from {}\n".format(index_path))
p.load_index(index_path)


Loading index from ../indexes/SIFT100K_index.bin



In [9]:
p.set_ef(128)
all_queries = xb.astype('float32')
print(all_queries.shape)
I, D = p.knn_query(all_queries, k=1)

(1000000, 128)


In [10]:
I.shape

(1000000, 1)

In [11]:
"""
Creat a mapping,
    for each vector in the graph, attach a list of close vectors that will be stored on disk
"""
I_list = list(I.reshape(-1))

node_to_vec_list = dict()
for vec_ID in hnsw_index.vec_ID_to_local_ID: # enumerate all vec_ID in the graph
    node_to_vec_list[vec_ID] = []
    
for i, vec_ID in enumerate(I_list):
    node_to_vec_list[vec_ID].append(i)

In [27]:
node_to_vec_list[0]

[0,
 203483,
 226433,
 317122,
 395686,
 414817,
 487099,
 520895,
 570523,
 575383,
 591816,
 601326,
 608767,
 648378,
 835152,
 862636,
 867816,
 989477]

In [25]:
hnsw_index.searchKnn(xq[0], k=100, ef=128)

([[54514.0, 0, 76346],
  [55145.0, 0, 45476],
  [56291.0, 0, 13166],
  [56684.0, 0, 21706],
  [60442.0, 0, 57428],
  [61629.0, 0, 69782],
  [62416.0, 0, 69873],
  [63540.0, 0, 84944],
  [63946.0, 0, 58420],
  [65414.0, 0, 76705],
  [65518.0, 0, 19596],
  [66519.0, 0, 45763],
  [66641.0, 0, 54589],
  [68240.0, 0, 67006],
  [69543.0, 0, 98550],
  [70368.0, 0, 89653],
  [71844.0, 0, 40734],
  [73618.0, 0, 26815],
  [73676.0, 0, 94977],
  [73906.0, 0, 13916],
  [74079.0, 0, 98690],
  [74373.0, 0, 42889],
  [75131.0, 0, 79036],
  [75489.0, 0, 77265],
  [76520.0, 0, 63517],
  [76649.0, 0, 6501],
  [76728.0, 0, 77279],
  [76828.0, 0, 80666],
  [77446.0, 0, 71399],
  [77467.0, 0, 70184],
  [78006.0, 0, 81792],
  [78051.0, 0, 92766],
  [78226.0, 0, 75905],
  [78361.0, 0, 79491],
  [79128.0, 0, 31430],
  [79492.0, 0, 63283],
  [79699.0, 0, 93064],
  [80098.0, 0, 98558],
  [80354.0, 0, 74725],
  [80371.0, 0, 10516],
  [80421.0, 0, 63300],
  [80747.0, 0, 64336],
  [80834.0, 0, 79612],
  [80839.0, 

In [26]:
I, D = p.knn_query(xq[0], k=100)
print(I)

[[76575 36011 61812 24740 66855  7397 23998 99134  8143  7408 74033 66759
  16367 37956 89589 92776 22199 75481 76562 68396 54622 89330 89590 13898
  13576 83451 89339 50381 16404 60687 77702 86641 13590 45919 35317 22487
  13549 19746 76583 60434 66925  8852 89375  7185 90275 16259 89072 77724
  40908   726 17655  7409  3793 89329 90182 88964 88778  9437 41039 46234
  35777 37902 89080 13625 85599 23047 28376 89132 34247 76568 63804  8261
  22937 54658 99409 57750 81081 88746 80551 58037   331 81349 88774 29595
  77830 65256 88427 47579 76579 22951 22205 47504 28284 71323 84039 35329
  75525 75463  9207 76561]]


In [15]:
hnsw_index.searchKnn(xq[0], k=1, ef=128)

([[92329.0, 0, 76575]],
 {8857, 12541, 60587, 81624, 88525, 88904},
 {8857, 12541, 60587, 81624, 88525, 88904},
 {331,
  726,
  3793,
  4497,
  7185,
  7397,
  7408,
  7409,
  7738,
  8143,
  8169,
  8214,
  8233,
  8261,
  8852,
  9207,
  9437,
  12545,
  13549,
  13576,
  13590,
  13625,
  13898,
  15769,
  16259,
  16367,
  16404,
  17655,
  19344,
  19746,
  21659,
  22199,
  22205,
  22487,
  22766,
  22937,
  22951,
  23047,
  23998,
  24740,
  28284,
  28376,
  29595,
  30992,
  34247,
  34414,
  35317,
  35329,
  35777,
  35902,
  36011,
  37902,
  37956,
  40683,
  40684,
  40685,
  40908,
  41039,
  43180,
  45887,
  45919,
  46234,
  47504,
  47579,
  50381,
  54622,
  54658,
  57537,
  57750,
  57951,
  57957,
  58037,
  60434,
  60687,
  61812,
  63804,
  65256,
  66759,
  66855,
  66925,
  68396,
  71323,
  74033,
  75463,
  75481,
  75525,
  76561,
  76562,
  76568,
  76575,
  76579,
  76583,
  77702,
  77724,
  77830,
  80551,
  81081,
  81349,
  83241,
  83451,
  84039

In [35]:
I, D = p.knn_query(xq, k=1)

In [36]:
recall_eval(I, gt, k=1, query_num=100)

0.12

In [47]:
"""
Get the search path on the ground layer, search all attached list
"""
ef = 128
k = 100
query_num = 1000

result_list_graph_nodes = []
search_path_list = []

for i in range(query_num):
    result, search_path_local_ID_upper, search_path_vec_ID_upper, search_path_local_ID_gnd, search_path_vec_ID_gnd = \
        hnsw_index.searchKnn(xq[i], k, ef)
    result_list_graph_nodes.append(result)
    search_path_list.append(search_path_vec_ID_gnd)

In [48]:
result_list = []
for ls in result_list_graph_nodes:
    tmp_list = []
    for dist, server_ID, vec_ID in ls:
        tmp_list.append(vec_ID)
    result_list.append(tmp_list)

In [60]:
""" Wenqi: basically, the result list is a subset of visited search path 
        This means without clustering, just using random DB vectors as centroid is not enough
"""
# c = 0
# for i, vec_ID in enumerate(result_list[-1]):
#     if vec_ID in search_path_vec_ID_gnd: c += 1
# print(c)

100


In [54]:
recall_eval(result_list, gt, k=1, query_num=query_num)

0.117

In [None]:
# tmp = []
# for i in range(query_num):
#     dist_vec_list = []
#     print("search path", search_path_list[i], len(search_path_list[i]))
#     for node_vec_ID in search_path_list[i]:
#         for disk_vec_ID in node_to_vec_list[node_vec_ID]:
#             tmp.append(disk_vec_ID)
#         break


In [50]:
final_result = []

for i in range(query_num):
    dist_vec_list = []
    for node_vec_ID in search_path_list[i]:
        for disk_vec_ID in node_to_vec_list[node_vec_ID]:
            
            vec = xb[disk_vec_ID]
            dist = np.sum((vec - xq[i]) ** 2)
            dist_vec_list.append((dist, disk_vec_ID))
#     print(dist_vec_list)
    dist_vec_list.sort()
#     dist_vec_list = dist_vec_list[:k]
    final_result.append([vec_ID for dist, vec_ID in dist_vec_list])

In [53]:
recall_eval(final_result, gt, k=1, query_num=query_num)

0.902

## Try clustering based method

In [55]:
from sklearn.cluster import KMeans

In [57]:
print('learning k-means clusters...')
train_size = int(1e6)
n_clusters = int(1e5) # 100 K centroids
n_seeds = 1 # only run 1 seed
kmeans = KMeans(n_clusters=n_clusters, n_init=n_seeds) 
xt = xb[:train_size]
kmeans.fit(xt)
print('finish learning k-means clusters...')

learning k-means clusters...


MemoryError: Unable to allocate 373. GiB for an array with shape (1000000, 100000) and data type float32

In [None]:
centroid_vectors = kmeans.cluster_centers_