From c02982a11f988cbaa742eae549a2ed49e83c226e Mon Sep 17 00:00:00 2001 From: ofiryanai Date: Sun, 19 Apr 2026 18:48:23 +0300 Subject: [PATCH] MOD-14916 Devirtualize HNSW / brute-force search hot path (#937) * MOD-14916 Devirtualize distance + getElement on HNSW search hot path MOD-14916 / LTK perf investigation. Two virtual dispatches per HNSW candidate added between v2.10.21 and v8.2.6 account for a measurable share of the KNN regression observed in the LTK benchmarks (-38% throughput on Intel). Both are removed here with the minimum possible change. V1 - distance computation: Every calcDistance() call goes through IndexCalculatorInterface's vtable to reach DistanceCalculatorCommon, which then calls the underlying SIMD function pointer. The intermediate vtable hop is pure indirection; the concrete calculator class is fixed for the life of an index. Expose the underlying dist_func via a new pure-virtual getDistFunc() on IndexCalculatorInterface, implemented by DistanceCalculatorCommon. Cache the returned function pointer in VecSimIndexAbstract at construction time and call it directly in calcDistance(), bypassing the virtual dispatch. V2 - vector fetch: HNSWIndex::getDataByInternalId and BruteForceIndex::getDataByInternalId call this->vectors->getElement(id), which is virtual through the RawDataContainer base. DataBlocksContainer is the only concrete implementation, and this->vectors is always a DataBlocksContainer (created and owned by VecSimIndexAbstract's constructor). Use a static_cast to DataBlocksContainer* plus a qualified call to DataBlocksContainer::getElement to skip the vtable lookup. No behavior change; per-candidate distance and neighbor-fetch calls on HNSW / brute force search paths become direct function-pointer / direct-member calls. Headers in index_factories, hnsw_serializer, and brute_force_factory compile cleanly. * MOD-14916 Inline DataBlocksContainer::getElement on HNSW search hot path Follow-up to the previous V1/V2 devirt commit. The static_cast+qualified call in getDataByInternalId removed the vtable lookup but left the larger cost on the table: DataBlocksContainer::getElement was still defined in data_blocks_container.cpp, so every per-candidate neighbor fetch still paid a real out-of-line function call and a bounds-checked blocks.at() lookup. Without LTO the compiler could neither inline the body nor hoist the div/mod in the HNSW hot loop. Move the definition into the header as inline and drop the .at() bounds check to match the v2.10.21 baseline, which used unchecked operator[] and was fully inlined into processCandidate. Also add a getDistFunc() override to DistanceCalculatorDummy in test_components.cpp so BUILD_TESTS still compiles after the pure virtual added in the previous commit. (cherry picked from commit 4ca500a6fbac0fde659717f726e8c23442c37e7c) --- src/VecSim/algorithms/brute_force/brute_force.h | 5 ++++- src/VecSim/algorithms/hnsw/hnsw.h | 4 +++- src/VecSim/containers/data_blocks_container.cpp | 5 ----- src/VecSim/containers/data_blocks_container.h | 6 +++++- src/VecSim/spaces/computer/calculator.h | 5 +++++ src/VecSim/vec_sim_index.h | 16 +++++++++++++--- tests/unit/test_components.cpp | 3 +++ 7 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index 23bd142de..d508ffb18 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -43,7 +43,10 @@ class BruteForceIndex : public VecSimIndexAbstract { size_t indexCapacity() const override; std::unique_ptr getVectorsIterator() const; const DataType *getDataByInternalId(idType id) const { - return reinterpret_cast(this->vectors->getElement(id)); + // `vectors` is always a DataBlocksContainer; skip the RawDataContainer vtable. + return reinterpret_cast( + static_cast(this->vectors) + ->DataBlocksContainer::getElement(id)); } VecSimQueryReply *topKQuery(const void *queryBlob, size_t k, VecSimQueryParams *queryParams) const override; diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index 2c629afa4..cbe4370a3 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h @@ -384,7 +384,9 @@ labelType HNSWIndex::getEntryPointLabel() const { template const char *HNSWIndex::getDataByInternalId(idType internal_id) const { - return this->vectors->getElement(internal_id); + // `vectors` is always a DataBlocksContainer; skip the RawDataContainer vtable on the hot path. + return static_cast(this->vectors) + ->DataBlocksContainer::getElement(internal_id); } template diff --git a/src/VecSim/containers/data_blocks_container.cpp b/src/VecSim/containers/data_blocks_container.cpp index bf63b683a..f6010b7db 100644 --- a/src/VecSim/containers/data_blocks_container.cpp +++ b/src/VecSim/containers/data_blocks_container.cpp @@ -38,11 +38,6 @@ RawDataContainer::Status DataBlocksContainer::addElement(const void *element, si return Status::OK; } -const char *DataBlocksContainer::getElement(size_t id) const { - assert(id < element_count); - return blocks.at(id / this->block_size).getElement(id % this->block_size); -} - RawDataContainer::Status DataBlocksContainer::removeElement(size_t id) { assert(id == element_count - 1); // only the last element can be removed blocks.back().popLastElement(); diff --git a/src/VecSim/containers/data_blocks_container.h b/src/VecSim/containers/data_blocks_container.h index c375590f2..74f66de11 100644 --- a/src/VecSim/containers/data_blocks_container.h +++ b/src/VecSim/containers/data_blocks_container.h @@ -40,7 +40,11 @@ class DataBlocksContainer : public VecsimBaseObject, public RawDataContainer { Status addElement(const void *element, size_t id) override; - const char *getElement(size_t id) const override; + // Inlined so the hot search path (via getDataByInternalId) can fold the indexing arithmetic. + const char *getElement(size_t id) const override { + assert(id < element_count); + return blocks[id / block_size].getElement(id % block_size); + } Status removeElement(size_t id) override; diff --git a/src/VecSim/spaces/computer/calculator.h b/src/VecSim/spaces/computer/calculator.h index a82293700..539325d8e 100644 --- a/src/VecSim/spaces/computer/calculator.h +++ b/src/VecSim/spaces/computer/calculator.h @@ -23,6 +23,9 @@ class IndexCalculatorInterface : public VecsimBaseObject { virtual ~IndexCalculatorInterface() = default; virtual DistType calcDistance(const void *v1, const void *v2, size_t dim) const = 0; + + // Raw distance function; cached by the index to skip the vtable on the hot path. + virtual spaces::dist_func_t getDistFunc() const = 0; }; /** @@ -56,4 +59,6 @@ class DistanceCalculatorCommon DistType calcDistance(const void *v1, const void *v2, size_t dim) const override { return this->dist_func(v1, v2, dim); } + + spaces::dist_func_t getDistFunc() const override { return this->dist_func; } }; diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h index 8d9543e37..315dc0c30 100644 --- a/src/VecSim/vec_sim_index.h +++ b/src/VecSim/vec_sim_index.h @@ -84,6 +84,7 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { RawDataContainer *vectors; // The raw vectors data container. private: IndexCalculatorInterface *indexCalculator; // Distance calculator. + spaces::dist_func_t cachedDistFunc; // Cached dist func, used on the hot path. PreprocessorsContainerAbstract *preprocessors; // Storage and query preprocessors. size_t inputBlobSize; // The size of input vectors/queries blob in bytes. May differ from dim * @@ -120,8 +121,11 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { metric(params.metric), blockSize(params.blockSize ? params.blockSize : DEFAULT_BLOCK_SIZE), lastMode(EMPTY_MODE), isMulti(params.multi), logCallbackCtx(params.logCtx), - indexCalculator(components.indexCalculator), preprocessors(components.preprocessors), - inputBlobSize(params.inputBlobSize), storedDataSize(params.storedDataSize) { + indexCalculator(components.indexCalculator), + cachedDistFunc(components.indexCalculator ? components.indexCalculator->getDistFunc() + : nullptr), + preprocessors(components.preprocessors), inputBlobSize(params.inputBlobSize), + storedDataSize(params.storedDataSize) { assert(VecSimType_sizeof(vecType)); assert(storedDataSize); assert(inputBlobSize); @@ -142,10 +146,16 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { /** * @brief Calculate the distance between two vectors based on index parameters. * + * Uses the cached dist func to avoid the indexCalculator vtable on the hot path. + * + * @note Precondition: @c cachedDistFunc must be non-null. Subclasses that construct + * this index with a null @c indexCalculator (e.g. SVS, which uses its own + * internal distance kernels) must not call this method. + * * @return the distance between the vectors. */ DistType calcDistance(const void *vector_data1, const void *vector_data2) const { - return indexCalculator->calcDistance(vector_data1, vector_data2, this->dim); + return cachedDistFunc(vector_data1, vector_data2, this->dim); } /** diff --git a/tests/unit/test_components.cpp b/tests/unit/test_components.cpp index c7c1b0879..1eeaa3adb 100644 --- a/tests/unit/test_components.cpp +++ b/tests/unit/test_components.cpp @@ -31,6 +31,9 @@ class DistanceCalculatorDummy : public DistanceCalculatorInterfacedist_func(7); } + + // Dummy uses a non-standard dist func signature, so the standard slot is unavailable. + spaces::dist_func_t getDistFunc() const override { return nullptr; } }; } // namespace dummyCalcultor