Skip to content

Commit

Permalink
resizing to align blocksize (#177)
Browse files Browse the repository at this point in the history
* modified resize logic of BF addvector so that the ids2vectors mapping size fits blockSize

* fixed increasing count in case of re-using id, updated brute_force_reindexing_same_vector test

* clang format

* HNSWIndex::addVector aligns the capacity to block size; in HNSWIndex::deleteVector, when an entire block is removed, the capacity is aligned to block size and decreased by one block; added tests for resize cases in hnswlib

* make format

* update size estimation test

* fixed after Alon's review

* clang format

* changed hnsw::removePoint to return false in case the label is not found,
also changed HNSWIndex::deleteVector to return false in this case (before checking if resizing is required) - this (hopefully) also covers the empty index case, and resizing happens only when an actual deletion happened

* added tests for empty HNSW index

* added isLabelExists to hnswlib and modified test of id overrides

* hnsw::removePoint returns void; checking if the label exists happens only in the wrapper

* clang format

* updated test to remove the check that element count is bigger than max_elements from addPoint (can't happen, we check this in addVector)

* override test initial size

* remove if exists in addvector update

* Update src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp

Co-authored-by: alonre24 <alonreshef24@gmail.com>

* Update tests/unit/test_hnswlib.cpp

Co-authored-by: alonre24 <alonreshef24@gmail.com>

* Update tests/unit/test_hnswlib.cpp

Co-authored-by: alonre24 <alonreshef24@gmail.com>

* removed redundant tests after Alon's review

* fixed tests

* Update src/VecSim/algorithms/brute_force/brute_force.cpp

Co-authored-by: DvirDukhan <dvir@redis.com>

* Update tests/unit/test_hnswlib.cpp

Co-authored-by: DvirDukhan <dvir@redis.com>

* empty index and capacity = 0  tests. removed todo from hnsw_wrapper

Co-authored-by: alonre24 <alonreshef24@gmail.com>
Co-authored-by: DvirDukhan <dvir@redis.com>
  • Loading branch information
3 people committed Aug 1, 2022
1 parent 8e88725 commit ba8b272
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 105 deletions.
34 changes: 20 additions & 14 deletions src/VecSim/algorithms/brute_force/brute_force.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,22 @@ int BruteForceIndex::addVector(const void *vector_data, size_t label) {
if (this->deletedIds.size() != 0) {
id = *this->deletedIds.begin();
this->deletedIds.erase(this->deletedIds.begin());
this->count++;
} else {
id = this->count++;
id = count;
// Save current id2vec size.
size_t ids_mapping_size = idToVectorBlockMemberMapping.size();

// If it's full - resize the index to be a multiplication of block size.
if (id >= ids_mapping_size) {
size_t last_block_vectors_count = count % vectorBlockSize;
this->idToVectorBlockMemberMapping.resize(ids_mapping_size + vectorBlockSize -
last_block_vectors_count);
}
}
}

// See if new id is bigger than current vector count. Needs to resize the index.
if (id >= this->idToVectorBlockMemberMapping.size()) {
this->idToVectorBlockMemberMapping.resize(std::ceil(this->count * 1.1));
}
// Anyway - increse count.
++count;

// Get vector block to store the vector in.
VectorBlock *vectorBlock;
Expand Down Expand Up @@ -127,7 +133,7 @@ int BruteForceIndex::deleteVector(size_t label) {
idType id;
auto optionalId = this->labelToIdLookup.find(label);
if (optionalId == this->labelToIdLookup.end()) {
// Nothing to delete;
// Nothing to delete.
return true;
} else {
id = optionalId->second;
Expand All @@ -142,21 +148,21 @@ int BruteForceIndex::deleteVector(size_t label) {
VectorBlockMember *lastVectorBlockMember =
lastVectorBlock->getMember(lastVectorBlock->getLength() - 1);

// Swap the last vector with the deleted vector;
// Swap the last vector with the deleted vector.
vectorBlock->setMember(vectorIndex, lastVectorBlockMember);

float *destination = vectorBlock->getVector(vectorIndex);
float *origin = lastVectorBlock->removeAndFetchVector();
memmove(destination, origin, sizeof(float) * this->dim);

// Delete the vector block membership
// Delete the vector block membership.
delete vectorBlockMember;
this->idToVectorBlockMemberMapping[id] = NULL;
// Add deleted id to reusable ids.
this->deletedIds.emplace(id);
this->labelToIdLookup.erase(label);

// If the last vector block is emtpy;
// If the last vector block is emtpy.
if (lastVectorBlock->getLength() == 0) {
delete lastVectorBlock;
this->vectorBlocks.pop_back();
Expand Down Expand Up @@ -206,7 +212,7 @@ VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t
void *timeoutCtx = queryParams ? queryParams->timeoutCtx : NULL;

this->last_mode = STANDARD_KNN;
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic
memcpy(normalized_blob, queryBlob, this->dim * sizeof(float));
Expand All @@ -216,7 +222,7 @@ VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t

float upperBound = std::numeric_limits<float>::lowest();
vecsim_stl::max_priority_queue<pair<float, labelType>> TopCandidates(this->allocator);
// For every block, compute its vectors scores and update the Top candidates max heap
// For every block, compute its vectors scores and update the Top candidates max heap.
for (auto vectorBlock : this->vectorBlocks) {
auto scores = computeBlockScores(vectorBlock, queryBlob, timeoutCtx, &rl.code);
if (VecSim_OK != rl.code) {
Expand Down Expand Up @@ -256,7 +262,7 @@ VecSimQueryResult_List BruteForceIndex::rangeQuery(const void *queryBlob, float
void *timeoutCtx = queryParams ? queryParams->timeoutCtx : nullptr;
this->last_mode = RANGE_QUERY;

float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic when other types will be supported.
memcpy(normalized_blob, queryBlob, this->dim * sizeof(float));
Expand Down Expand Up @@ -299,7 +305,7 @@ VecSimIndexInfo BruteForceIndex::info() const {

VecSimInfoIterator *BruteForceIndex::infoIterator() {
VecSimIndexInfo info = this->info();
// For readability. Update this number when needed;
// For readability. Update this number when needed.
size_t numberOfInfoFields = 8;
VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields);

Expand Down
50 changes: 38 additions & 12 deletions src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ size_t HNSWIndex::estimateInitialSize(const HNSWParams *params) {
sizeof(size_t);
est += sizeof(*hnsw) + sizeof(size_t);
est += sizeof(VisitedNodesHandler) + sizeof(size_t);
// used for synchronization only when parallel indexing / searching is enabled.
// Used for synchronization only when parallel indexing / searching is enabled.
#ifdef ENABLE_PARALLELIZATION
est += sizeof(VisitedNodesHandlerPool);
#endif
Expand All @@ -46,7 +46,7 @@ size_t HNSWIndex::estimateInitialSize(const HNSWParams *params) {
est += sizeof(void *) * params->initialCapacity + sizeof(size_t); // link lists (for levels > 0)
est += sizeof(size_t) * params->initialCapacity + sizeof(size_t); // element level
est += sizeof(size_t) * params->initialCapacity +
sizeof(size_t); // labels lookup hash table buckets
sizeof(size_t); // Labels lookup hash table buckets.

size_t size_links_level0 =
sizeof(linklistsizeint) + params->M * 2 * sizeof(tableint) + sizeof(void *);
Expand Down Expand Up @@ -92,16 +92,24 @@ size_t HNSWIndex::estimateElementMemory(const HNSWParams *params) {
}

int HNSWIndex::addVector(const void *vector_data, size_t id) {

// If id already exists remove and re-add
if (this->hnsw->isLabelExist(id)) {
this->hnsw->removePoint(id);
}

try {
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic
memcpy(normalized_data, vector_data, this->dim * sizeof(float));
float_vector_normalize(normalized_data, this->dim);
vector_data = normalized_data;
}
if (hnsw->getIndexSize() == this->hnsw->getIndexCapacity()) {
this->hnsw->resizeIndex(this->hnsw->getIndexCapacity() + this->blockSize);
size_t index_capacity = this->hnsw->getIndexCapacity();
if (hnsw->getIndexSize() == index_capacity) {
size_t vectors_to_add = blockSize - index_capacity % blockSize;
this->hnsw->resizeIndex(index_capacity + vectors_to_add);
}
this->hnsw->addPoint(vector_data, id);
return true;
Expand All @@ -111,11 +119,29 @@ int HNSWIndex::addVector(const void *vector_data, size_t id) {
}

int HNSWIndex::deleteVector(size_t id) {
bool res = this->hnsw->removePoint(id);
if (hnsw->getIndexSize() + this->blockSize <= this->hnsw->getIndexCapacity()) {
this->hnsw->resizeIndex(this->hnsw->getIndexCapacity() - this->blockSize);

// If id doesnt exist.
if (!this->hnsw->isLabelExist(id)) {
return false;
}
return res;

// Else, *delete* it from the graph.
this->hnsw->removePoint(id);

size_t index_size = hnsw->getIndexSize();
size_t curr_capacity = this->hnsw->getIndexCapacity();

// If we need to free a complete block & there is a least one block between the
// capacity and the size.
if (index_size % blockSize == 0 && index_size + blockSize <= curr_capacity) {

// Check if the capacity is aligned to block size.
size_t extra_space_to_free = curr_capacity % blockSize;

// Remove one block from the capacity.
this->hnsw->resizeIndex(curr_capacity - blockSize - extra_space_to_free);
}
return true;
}

double HNSWIndex::getDistanceFrom(size_t label, const void *vector_data) {
Expand All @@ -132,7 +158,7 @@ VecSimQueryResult_List HNSWIndex::topKQuery(const void *query_data, size_t k,
void *timeoutCtx = nullptr;
try {
this->last_mode = STANDARD_KNN;
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic
memcpy(normalized_data, query_data, this->dim * sizeof(float));
Expand All @@ -158,7 +184,7 @@ VecSimQueryResult_List HNSWIndex::topKQuery(const void *query_data, size_t k,
VecSimQueryResult_SetScore(rl.results[i], knn_res.top().first);
knn_res.pop();
}
// Restore efRuntime
// Restore efRuntime.
hnsw->setEf(originalEF);
assert(hnsw->getEf() == originalEF);

Expand Down Expand Up @@ -204,7 +230,7 @@ VecSimBatchIterator *HNSWIndex::newBatchIterator(const void *queryBlob,

VecSimInfoIterator *HNSWIndex::infoIterator() {
VecSimIndexInfo info = this->info();
// For readability. Update this number when needed;
// For readability. Update this number when needed.
size_t numberOfInfoFields = 12;
VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields);

Expand Down
23 changes: 9 additions & 14 deletions src/VecSim/algorithms/hnsw/hnswlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,14 @@ class HierarchicalNSW : public VecsimBaseObject {
linklistsizeint *get_linklist_at_level(tableint internal_id, size_t level) const;
unsigned short int getListCount(const linklistsizeint *ptr) const;
void resizeIndex(size_t new_max_elements);
bool removePoint(labeltype label);
void removePoint(labeltype label);
void addPoint(const void *data_point, labeltype label);
dist_t getDistanceByLabelFromPoint(labeltype label, const void *data_point);
tableint searchBottomLayerEP(const void *query_data, void *timeoutCtx,
VecSimQueryResult_Code *rc) const;
vecsim_stl::max_priority_queue<pair<dist_t, labeltype>>
searchKnn(const void *query_data, size_t k, void *timeoutCtx, VecSimQueryResult_Code *rc) const;
bool isLabelExist(labeltype label);
};

/**
Expand Down Expand Up @@ -322,6 +323,10 @@ VisitedNodesHandler *HierarchicalNSW<dist_t>::getVisitedList() const {
return visited_nodes_handler.get();
}

/**
 * Reports whether `label` is currently present as a key in `label_lookup_`.
 *
 * @param label the external label to look up.
 * @return true when a lookup entry for `label` is found, false otherwise.
 */
template <typename dist_t>
bool HierarchicalNSW<dist_t>::isLabelExist(labeltype label) {
    const auto lookup_entry = label_lookup_.find(label);
    const bool is_present = lookup_entry != label_lookup_.end();
    return is_present;
}
/**
* helper functions
*/
Expand Down Expand Up @@ -945,11 +950,8 @@ void HierarchicalNSW<dist_t>::resizeIndex(size_t new_max_elements) {
}

template <typename dist_t>
bool HierarchicalNSW<dist_t>::removePoint(const labeltype label) {
// check that the label actually exists in the graph, and update the number of elements.
if (label_lookup_.find(label) == label_lookup_.end()) {
return true;
}
void HierarchicalNSW<dist_t>::removePoint(const labeltype label) {

tableint element_internal_id = label_lookup_[label];
vecsim_stl::vector<bool> neighbours_bitmap(allocator);

Expand Down Expand Up @@ -1029,7 +1031,6 @@ bool HierarchicalNSW<dist_t>::removePoint(const labeltype label) {
} else {
SwapLastIdWithDeletedId(element_internal_id, last_element_internal_id);
}
return true;
}

template <typename dist_t>
Expand All @@ -1041,13 +1042,7 @@ void HierarchicalNSW<dist_t>::addPoint(const void *data_point, const labeltype l
#ifdef ENABLE_PARALLELIZATION
std::unique_lock<std::mutex> templock_curr(cur_element_count_guard_);
#endif
// Checking if an element with the given label already exists. if so, remove it.
if (label_lookup_.find(label) != label_lookup_.end()) {
removePoint(label);
}
if (cur_element_count >= max_elements_) {
throw std::runtime_error("The number of elements exceeds the specified limit");
}

cur_c = max_id = cur_element_count++;
label_lookup_[label] = cur_c;
}
Expand Down
Loading

0 comments on commit ba8b272

Please sign in to comment.