Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

resizing to align blocksize #177

Merged
merged 28 commits into from
Aug 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
50788eb
modified resize logic of BF addvector so that the ids2vectors mapping…
meiravgri Jul 27, 2022
ea40e48
fixed increasing count in case of re-using id, update brute_force_rei…
meiravgri Jul 27, 2022
7e79daa
clang format
meiravgri Jul 27, 2022
312cfea
HNSWIndex::addVector aligns the capacity to block size, HNSWIndex::de…
meiravgri Jul 27, 2022
2948388
make format
meiravgri Jul 27, 2022
04f07da
update size estimation test
meiravgri Jul 27, 2022
97eaae9
Merge branch 'main' into update_resize_logic
alonre24 Jul 27, 2022
b277fca
fixed after Alon's review
meiravgri Jul 28, 2022
68b0106
Merge branch 'update_resize_logic' of https://github.com/RedisAI/Vect…
meiravgri Jul 28, 2022
58d639e
clang format
meiravgri Jul 28, 2022
3de7b8c
changed hnsw::remove point to return false in case not found,
meiravgri Jul 28, 2022
aec50b2
added tests for empty HNSW index
meiravgri Jul 28, 2022
f3021d1
added isLabelExists to hnswlib and modified test of id overrides
meiravgri Jul 28, 2022
7fa6cac
hnsw::removepoint return void, checking if label exists happens only …
meiravgri Jul 28, 2022
3acbf38
clang format
meiravgri Jul 28, 2022
d8d1f01
update test to removed checking if element count is bigger than max_e…
meiravgri Jul 28, 2022
14990ee
override test initial size
meiravgri Jul 28, 2022
2bb1b4f
remove if exists in addvector update
meiravgri Jul 28, 2022
87be64f
Merge branch 'main' into update_resize_logic
meiravgri Jul 29, 2022
4d26b35
Update src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp
meiravgri Jul 31, 2022
d8aee57
Update tests/unit/test_hnswlib.cpp
meiravgri Jul 31, 2022
394e792
Update tests/unit/test_hnswlib.cpp
meiravgri Jul 31, 2022
4cfa183
removed redundant tests after alon's review
meiravgri Jul 31, 2022
809d051
Merge branch 'update_resize_logic' of https://github.com/RedisAI/Vect…
meiravgri Jul 31, 2022
4aa3ca5
fixed tests
meiravgri Jul 31, 2022
cf3bb52
Update src/VecSim/algorithms/brute_force/brute_force.cpp
meiravgri Aug 1, 2022
4abaa1d
Update tests/unit/test_hnswlib.cpp
meiravgri Aug 1, 2022
93df064
empty index and capacity = 0 tests. removed todo from hnsw_wrapper
meiravgri Aug 1, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions src/VecSim/algorithms/brute_force/brute_force.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,22 @@ int BruteForceIndex::addVector(const void *vector_data, size_t label) {
if (this->deletedIds.size() != 0) {
id = *this->deletedIds.begin();
this->deletedIds.erase(this->deletedIds.begin());
this->count++;
} else {
id = this->count++;
id = count;
// Save current id2vec size.
size_t ids_mapping_size = idToVectorBlockMemberMapping.size();

// If it's full - resize the index to be a multiple of block size.
if (id >= ids_mapping_size) {
size_t last_block_vectors_count = count % vectorBlockSize;
this->idToVectorBlockMemberMapping.resize(ids_mapping_size + vectorBlockSize -
last_block_vectors_count);
}
}
}

// See if new id is bigger than current vector count. Needs to resize the index.
if (id >= this->idToVectorBlockMemberMapping.size()) {
this->idToVectorBlockMemberMapping.resize(std::ceil(this->count * 1.1));
}
// Anyway - increase count.
++count;

// Get vector block to store the vector in.
VectorBlock *vectorBlock;
Expand Down Expand Up @@ -127,7 +133,7 @@ int BruteForceIndex::deleteVector(size_t label) {
idType id;
auto optionalId = this->labelToIdLookup.find(label);
if (optionalId == this->labelToIdLookup.end()) {
// Nothing to delete;
// Nothing to delete.
return true;
} else {
id = optionalId->second;
Expand All @@ -142,21 +148,21 @@ int BruteForceIndex::deleteVector(size_t label) {
VectorBlockMember *lastVectorBlockMember =
lastVectorBlock->getMember(lastVectorBlock->getLength() - 1);

// Swap the last vector with the deleted vector;
// Swap the last vector with the deleted vector.
vectorBlock->setMember(vectorIndex, lastVectorBlockMember);

float *destination = vectorBlock->getVector(vectorIndex);
float *origin = lastVectorBlock->removeAndFetchVector();
memmove(destination, origin, sizeof(float) * this->dim);

// Delete the vector block membership
// Delete the vector block membership.
delete vectorBlockMember;
this->idToVectorBlockMemberMapping[id] = NULL;
// Add deleted id to reusable ids.
this->deletedIds.emplace(id);
this->labelToIdLookup.erase(label);

// If the last vector block is emtpy;
// If the last vector block is empty.
if (lastVectorBlock->getLength() == 0) {
delete lastVectorBlock;
this->vectorBlocks.pop_back();
Expand Down Expand Up @@ -206,7 +212,7 @@ VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t
void *timeoutCtx = queryParams ? queryParams->timeoutCtx : NULL;

this->last_mode = STANDARD_KNN;
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic
memcpy(normalized_blob, queryBlob, this->dim * sizeof(float));
Expand All @@ -216,7 +222,7 @@ VecSimQueryResult_List BruteForceIndex::topKQuery(const void *queryBlob, size_t

float upperBound = std::numeric_limits<float>::lowest();
vecsim_stl::max_priority_queue<pair<float, labelType>> TopCandidates(this->allocator);
// For every block, compute its vectors scores and update the Top candidates max heap
// For every block, compute its vectors scores and update the Top candidates max heap.
for (auto vectorBlock : this->vectorBlocks) {
auto scores = computeBlockScores(vectorBlock, queryBlob, timeoutCtx, &rl.code);
if (VecSim_OK != rl.code) {
Expand Down Expand Up @@ -256,7 +262,7 @@ VecSimQueryResult_List BruteForceIndex::rangeQuery(const void *queryBlob, float
void *timeoutCtx = queryParams ? queryParams->timeoutCtx : nullptr;
this->last_mode = RANGE_QUERY;

float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic when other types will be supported.
memcpy(normalized_blob, queryBlob, this->dim * sizeof(float));
Expand Down Expand Up @@ -299,7 +305,7 @@ VecSimIndexInfo BruteForceIndex::info() const {

VecSimInfoIterator *BruteForceIndex::infoIterator() {
VecSimIndexInfo info = this->info();
// For readability. Update this number when needed;
// For readability. Update this number when needed.
size_t numberOfInfoFields = 8;
VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields);

Expand Down
50 changes: 38 additions & 12 deletions src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ size_t HNSWIndex::estimateInitialSize(const HNSWParams *params) {
sizeof(size_t);
est += sizeof(*hnsw) + sizeof(size_t);
est += sizeof(VisitedNodesHandler) + sizeof(size_t);
// used for synchronization only when parallel indexing / searching is enabled.
// Used for synchronization only when parallel indexing / searching is enabled.
#ifdef ENABLE_PARALLELIZATION
est += sizeof(VisitedNodesHandlerPool);
#endif
Expand All @@ -46,7 +46,7 @@ size_t HNSWIndex::estimateInitialSize(const HNSWParams *params) {
est += sizeof(void *) * params->initialCapacity + sizeof(size_t); // link lists (for levels > 0)
est += sizeof(size_t) * params->initialCapacity + sizeof(size_t); // element level
est += sizeof(size_t) * params->initialCapacity +
sizeof(size_t); // labels lookup hash table buckets
sizeof(size_t); // Labels lookup hash table buckets.

size_t size_links_level0 =
sizeof(linklistsizeint) + params->M * 2 * sizeof(tableint) + sizeof(void *);
Expand Down Expand Up @@ -92,16 +92,24 @@ size_t HNSWIndex::estimateElementMemory(const HNSWParams *params) {
}

int HNSWIndex::addVector(const void *vector_data, size_t id) {

// If id already exists remove and re-add
if (this->hnsw->isLabelExist(id)) {
this->hnsw->removePoint(id);
}

try {
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic
memcpy(normalized_data, vector_data, this->dim * sizeof(float));
float_vector_normalize(normalized_data, this->dim);
vector_data = normalized_data;
}
if (hnsw->getIndexSize() == this->hnsw->getIndexCapacity()) {
this->hnsw->resizeIndex(this->hnsw->getIndexCapacity() + this->blockSize);
size_t index_capacity = this->hnsw->getIndexCapacity();
if (hnsw->getIndexSize() == index_capacity) {
size_t vectors_to_add = blockSize - index_capacity % blockSize;
this->hnsw->resizeIndex(index_capacity + vectors_to_add);
}
this->hnsw->addPoint(vector_data, id);
return true;
Expand All @@ -111,11 +119,29 @@ int HNSWIndex::addVector(const void *vector_data, size_t id) {
}

int HNSWIndex::deleteVector(size_t id) {
bool res = this->hnsw->removePoint(id);
if (hnsw->getIndexSize() + this->blockSize <= this->hnsw->getIndexCapacity()) {
this->hnsw->resizeIndex(this->hnsw->getIndexCapacity() - this->blockSize);

// If id doesn't exist.
if (!this->hnsw->isLabelExist(id)) {
return false;
}
return res;

// Else, *delete* it from the graph.
this->hnsw->removePoint(id);

size_t index_size = hnsw->getIndexSize();
size_t curr_capacity = this->hnsw->getIndexCapacity();

// If we need to free a complete block & there is at least one block between the
// capacity and the size.
if (index_size % blockSize == 0 && index_size + blockSize <= curr_capacity) {

// Check if the capacity is aligned to block size.
size_t extra_space_to_free = curr_capacity % blockSize;

// Remove one block from the capacity.
this->hnsw->resizeIndex(curr_capacity - blockSize - extra_space_to_free);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about this TODO?

}
return true;
}

double HNSWIndex::getDistanceFrom(size_t label, const void *vector_data) {
Expand All @@ -132,7 +158,7 @@ VecSimQueryResult_List HNSWIndex::topKQuery(const void *query_data, size_t k,
void *timeoutCtx = nullptr;
try {
this->last_mode = STANDARD_KNN;
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine
float normalized_data[this->dim]; // This will be use only if metric == VecSimMetric_Cosine.
if (this->metric == VecSimMetric_Cosine) {
// TODO: need more generic
memcpy(normalized_data, query_data, this->dim * sizeof(float));
Expand All @@ -158,7 +184,7 @@ VecSimQueryResult_List HNSWIndex::topKQuery(const void *query_data, size_t k,
VecSimQueryResult_SetScore(rl.results[i], knn_res.top().first);
knn_res.pop();
}
// Restore efRuntime
// Restore efRuntime.
hnsw->setEf(originalEF);
assert(hnsw->getEf() == originalEF);

Expand Down Expand Up @@ -204,7 +230,7 @@ VecSimBatchIterator *HNSWIndex::newBatchIterator(const void *queryBlob,

VecSimInfoIterator *HNSWIndex::infoIterator() {
VecSimIndexInfo info = this->info();
// For readability. Update this number when needed;
// For readability. Update this number when needed.
size_t numberOfInfoFields = 12;
VecSimInfoIterator *infoIterator = new VecSimInfoIterator(numberOfInfoFields);

Expand Down
23 changes: 9 additions & 14 deletions src/VecSim/algorithms/hnsw/hnswlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,14 @@ class HierarchicalNSW : public VecsimBaseObject {
linklistsizeint *get_linklist_at_level(tableint internal_id, size_t level) const;
unsigned short int getListCount(const linklistsizeint *ptr) const;
void resizeIndex(size_t new_max_elements);
bool removePoint(labeltype label);
void removePoint(labeltype label);
void addPoint(const void *data_point, labeltype label);
dist_t getDistanceByLabelFromPoint(labeltype label, const void *data_point);
tableint searchBottomLayerEP(const void *query_data, void *timeoutCtx,
VecSimQueryResult_Code *rc) const;
vecsim_stl::max_priority_queue<pair<dist_t, labeltype>>
searchKnn(const void *query_data, size_t k, void *timeoutCtx, VecSimQueryResult_Code *rc) const;
bool isLabelExist(labeltype label);
};

/**
Expand Down Expand Up @@ -322,6 +323,10 @@ VisitedNodesHandler *HierarchicalNSW<dist_t>::getVisitedList() const {
return visited_nodes_handler.get();
}

/**
 * Reports whether the given label currently has an entry in label_lookup_.
 *
 * @param label the external label to look up.
 * @return true if label_lookup_ holds an entry keyed by `label`, false otherwise.
 */
template <typename dist_t>
bool HierarchicalNSW<dist_t>::isLabelExist(labeltype label) {
    // A lookup that lands on end() means no entry is keyed by this label.
    const auto entry = label_lookup_.find(label);
    return entry != label_lookup_.end();
}
/**
* helper functions
*/
Expand Down Expand Up @@ -945,11 +950,8 @@ void HierarchicalNSW<dist_t>::resizeIndex(size_t new_max_elements) {
}

template <typename dist_t>
bool HierarchicalNSW<dist_t>::removePoint(const labeltype label) {
// check that the label actually exists in the graph, and update the number of elements.
if (label_lookup_.find(label) == label_lookup_.end()) {
return true;
}
void HierarchicalNSW<dist_t>::removePoint(const labeltype label) {

tableint element_internal_id = label_lookup_[label];
vecsim_stl::vector<bool> neighbours_bitmap(allocator);

Expand Down Expand Up @@ -1029,7 +1031,6 @@ bool HierarchicalNSW<dist_t>::removePoint(const labeltype label) {
} else {
SwapLastIdWithDeletedId(element_internal_id, last_element_internal_id);
}
return true;
}

template <typename dist_t>
Expand All @@ -1041,13 +1042,7 @@ void HierarchicalNSW<dist_t>::addPoint(const void *data_point, const labeltype l
#ifdef ENABLE_PARALLELIZATION
std::unique_lock<std::mutex> templock_curr(cur_element_count_guard_);
#endif
// Checking if an element with the given label already exists. if so, remove it.
if (label_lookup_.find(label) != label_lookup_.end()) {
removePoint(label);
}
if (cur_element_count >= max_elements_) {
throw std::runtime_error("The number of elements exceeds the specified limit");
}

cur_c = max_id = cur_element_count++;
label_lookup_[label] = cur_c;
}
Expand Down
Loading