From 14ffd8ad3fe730ca686f6363e52a2454a3a88638 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 7 May 2023 18:20:58 +0300 Subject: [PATCH 1/5] fixed a bug in preferAdHocSearch of tiered --- src/VecSim/algorithms/hnsw/hnsw_tiered.h | 6 ------ src/VecSim/vec_sim_tiered_index.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index e6acc3519..55eed81e3 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -189,12 +189,6 @@ class TieredHNSWIndex : public VecSimTieredIndex { return new (this->allocator) TieredHNSW_BatchIterator(queryBlobCopy, this, queryParams, this->allocator); } - bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override { - // For now, decide according to the bigger index. - return this->backendIndex->indexSize() > this->frontendIndex->indexSize() - ? this->backendIndex->preferAdHocSearch(subsetSize, k, initial_check) - : this->frontendIndex->preferAdHocSearch(subsetSize, k, initial_check); - } inline void setLastSearchMode(VecSearchMode mode) override { return this->backendIndex->setLastSearchMode(mode); } diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h index b950d18c5..387d9a693 100644 --- a/src/VecSim/vec_sim_tiered_index.h +++ b/src/VecSim/vec_sim_tiered_index.h @@ -69,6 +69,17 @@ class VecSimTieredIndex : public VecSimIndexInterface { VecSimQueryParams *queryParams, VecSimQueryResult_Order order) override; + bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override { + // For now, decide according to the bigger index. + size_t frontend_size = this->frontendIndex->indexSize(); + size_t backend_size = this->backendIndex->indexSize(); + return backend_size > frontend_size + ? this->backendIndex->preferAdHocSearch(std::min(subsetSize, backend_size), k, + initial_check) + : this->frontendIndex->preferAdHocSearch(std::min(subsetSize, frontend_size), k, + initial_check); + } + // Return the current state of the global write mode (async/in-place). static VecSimWriteMode getWriteMode() { return VecSimIndexInterface::asyncWriteMode; } From f75c867609daded5429bd00cd29c3fe79a04a524 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 7 May 2023 18:21:08 +0300 Subject: [PATCH 2/5] added a test --- .../hnsw/hnsw_tiered_tests_friends.h | 1 + tests/unit/test_hnsw_tiered.cpp | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h index 7092615cf..189a0e439 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h @@ -44,3 +44,4 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMulti_Test) INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_deleteVectorMultiFromFlatAdvanced_Test) INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_overwriteVectorBasic_Test) INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_overwriteVectorAsync_Test) +INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_preferAdHocOptimization_Test) diff --git a/tests/unit/test_hnsw_tiered.cpp b/tests/unit/test_hnsw_tiered.cpp index 23ca3ff7c..689bae8d8 100644 --- a/tests/unit/test_hnsw_tiered.cpp +++ b/tests/unit/test_hnsw_tiered.cpp @@ -3374,3 +3374,45 @@ TYPED_TEST(HNSWTieredIndexTest, parallelRangeSearch) { // Cleanup. delete index_ctx; } + +TYPED_TEST(HNSWTieredIndexTestBasic, preferAdHocOptimization) { + size_t dim = 4; + + HNSWParams params = { + .type = TypeParam::get_index_type(), + .dim = dim, + .metric = VecSimMetric_L2, + }; + VecSimParams hnsw_params = CreateParams(params); + auto jobQ = JobQueue(); + auto index_ctx = new IndexExtCtx(); + size_t memory_ctx = 0; + + // Create tiered index with buffer limit set to 0. + auto *tiered_index = this->CreateTieredHNSWIndex(hnsw_params, &jobQ, index_ctx, &memory_ctx); + auto allocator = tiered_index->getAllocator(); + + auto hnsw = tiered_index->backendIndex; + auto flat = tiered_index->frontendIndex; + + // Insert 5 vectors to the main index. + for (size_t i = 0; i < 5; i++) { + GenerateAndAddVector(hnsw, dim, i, i); + } + // Sanity check. Should choose as HNSW. + ASSERT_EQ(tiered_index->preferAdHocSearch(5, 5, true), hnsw->preferAdHocSearch(5, 5, true)); + + // Insert 6 vectors to the flat index. + for (size_t i = 0; i < 6; i++) { + GenerateAndAddVector(flat, dim, i, i); + } + // Sanity check. Should choose as flat as it has more vectors. + ASSERT_EQ(tiered_index->preferAdHocSearch(5, 5, true), flat->preferAdHocSearch(5, 5, true)); + + // Check for preference of tiered with subset (10) smaller than the tiered index size (11), + // but larger than any of the underlying indexes. + ASSERT_NO_THROW(tiered_index->preferAdHocSearch(10, 5, false)); + + // Cleanup. + delete index_ctx; +} From 887ce0717701691fdef063c5266dce07a70e29f5 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 7 May 2023 19:11:46 +0300 Subject: [PATCH 3/5] made it possible to query for preference of too big subset size --- src/VecSim/algorithms/brute_force/brute_force.h | 6 +++--- src/VecSim/algorithms/hnsw/hnsw.h | 6 +++--- src/VecSim/vec_sim_tiered_index.h | 10 +++------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index d47b6dd70..8121b5d60 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -422,9 +422,9 @@ bool BruteForceIndex::preferAdHocSearch(size_t subsetSize, s // This heuristic is based on sklearn decision tree classifier (with 10 leaves nodes) - // see scripts/BF_batches_clf.py size_t index_size = this->indexSize(); - if (subsetSize > index_size) { - throw std::runtime_error("internal error: subset size cannot be larger than index size"); - } + // Referring to too large subset size as if it was the maximum possible size. + subsetSize = std::min(subsetSize, index_size); + size_t d = this->dim; float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount(); bool res; diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 78516a193..a3b08caa5 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -2249,9 +2249,9 @@ bool HNSWIndex::preferAdHocSearch(size_t subsetSize, size_t // This heuristic is based on sklearn decision tree classifier (with 20 leaves nodes) - // see scripts/HNSW_batches_clf.py size_t index_size = this->indexSize(); - if (subsetSize > index_size) { - throw std::runtime_error("internal error: subset size cannot be larger than index size"); - } + // Referring to too large subset size as if it was the maximum possible size. + subsetSize = std::min(subsetSize, index_size); + size_t d = this->dim; size_t M = this->getM(); float r = (index_size == 0) ? 0.0f : (float)(subsetSize) / (float)this->indexLabelCount(); diff --git a/src/VecSim/vec_sim_tiered_index.h b/src/VecSim/vec_sim_tiered_index.h index 387d9a693..bba8f8752 100644 --- a/src/VecSim/vec_sim_tiered_index.h +++ b/src/VecSim/vec_sim_tiered_index.h @@ -71,13 +71,9 @@ class VecSimTieredIndex : public VecSimIndexInterface { bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) override { // For now, decide according to the bigger index. - size_t frontend_size = this->frontendIndex->indexSize(); - size_t backend_size = this->backendIndex->indexSize(); - return backend_size > frontend_size - ? this->backendIndex->preferAdHocSearch(std::min(subsetSize, backend_size), k, - initial_check) - : this->frontendIndex->preferAdHocSearch(std::min(subsetSize, frontend_size), k, - initial_check); + return this->backendIndex->indexSize() > this->frontendIndex->indexSize() + ? this->backendIndex->preferAdHocSearch(subsetSize, k, initial_check) + : this->frontendIndex->preferAdHocSearch(subsetSize, k, initial_check); } // Return the current state of the global write mode (async/in-place). From e31b7a3dac85b1805cb74afb7cbdee95d2b5a665 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 7 May 2023 19:12:05 +0300 Subject: [PATCH 4/5] fix tests --- tests/unit/test_bruteforce.cpp | 9 ++------- tests/unit/test_hnsw.cpp | 9 ++------- tests/unit/test_hnsw_multi.cpp | 9 ++------- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 870485a87..69d6fe7f8 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -1238,13 +1238,8 @@ TYPED_TEST(BruteForceTest, preferAdHocOptimization) { ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); // Corner cases - subset size is greater than index size. - try { - VecSimIndex_PreferAdHocSearch(index, 1, 50, true); - FAIL() << "Expected std::runtime error"; - } catch (std::runtime_error const &err) { - EXPECT_EQ(err.what(), - std::string("internal error: subset size cannot be larger than index size")); - } + ASSERT_NO_THROW(VecSimIndex_PreferAdHocSearch(index, 1, 50, true)); + VecSimIndex_Free(index); } diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp index 0362523b0..0f83003f4 100644 --- a/tests/unit/test_hnsw.cpp +++ b/tests/unit/test_hnsw.cpp @@ -1503,13 +1503,8 @@ TYPED_TEST(HNSWTest, preferAdHocOptimization) { ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); // Corner cases - subset size is greater than index size. - try { - VecSimIndex_PreferAdHocSearch(index, 1, 50, true); - FAIL() << "Expected std::runtime error"; - } catch (std::runtime_error const &err) { - EXPECT_EQ(err.what(), - std::string("internal error: subset size cannot be larger than index size")); - } + ASSERT_NO_THROW(VecSimIndex_PreferAdHocSearch(index, 1, 50, true)); + VecSimIndex_Free(index); } diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp index 158262229..cf596e1af 100644 --- a/tests/unit/test_hnsw_multi.cpp +++ b/tests/unit/test_hnsw_multi.cpp @@ -627,13 +627,8 @@ TYPED_TEST(HNSWMultiTest, preferAdHocOptimization) { ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); // Corner cases - subset size is greater than index size. - try { - VecSimIndex_PreferAdHocSearch(index, 1, 50, true); - FAIL() << "Expected std::runtime error"; - } catch (std::runtime_error const &err) { - EXPECT_EQ(err.what(), - std::string("internal error: subset size cannot be larger than index size")); - } + ASSERT_NO_THROW(VecSimIndex_PreferAdHocSearch(index, 1, 50, true)); + VecSimIndex_Free(index); } TYPED_TEST(HNSWMultiTest, search_empty_index) { From 7aa5e3dbc670a8367a3e73c0479cdc7422a2f8f5 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Sun, 7 May 2023 19:32:16 +0300 Subject: [PATCH 5/5] review fix --- tests/unit/test_bruteforce.cpp | 3 ++- tests/unit/test_hnsw.cpp | 3 ++- tests/unit/test_hnsw_multi.cpp | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 69d6fe7f8..0ae0ed9db 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -1238,7 +1238,8 @@ TYPED_TEST(BruteForceTest, preferAdHocOptimization) { ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); // Corner cases - subset size is greater than index size. - ASSERT_NO_THROW(VecSimIndex_PreferAdHocSearch(index, 1, 50, true)); + ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true), + VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); VecSimIndex_Free(index); } diff --git a/tests/unit/test_hnsw.cpp b/tests/unit/test_hnsw.cpp index 0f83003f4..f2e73e15f 100644 --- a/tests/unit/test_hnsw.cpp +++ b/tests/unit/test_hnsw.cpp @@ -1503,7 +1503,8 @@ TYPED_TEST(HNSWTest, preferAdHocOptimization) { ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); // Corner cases - subset size is greater than index size. - ASSERT_NO_THROW(VecSimIndex_PreferAdHocSearch(index, 1, 50, true)); + ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true), + VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); VecSimIndex_Free(index); } diff --git a/tests/unit/test_hnsw_multi.cpp b/tests/unit/test_hnsw_multi.cpp index cf596e1af..55b81398e 100644 --- a/tests/unit/test_hnsw_multi.cpp +++ b/tests/unit/test_hnsw_multi.cpp @@ -627,7 +627,8 @@ TYPED_TEST(HNSWMultiTest, preferAdHocOptimization) { ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); // Corner cases - subset size is greater than index size. - ASSERT_NO_THROW(VecSimIndex_PreferAdHocSearch(index, 1, 50, true)); + ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true), + VecSimIndex_PreferAdHocSearch(index, 0, 50, true)); VecSimIndex_Free(index); }