diff --git a/src/VecSim/algorithms/brute_force/brute_force.cpp b/src/VecSim/algorithms/brute_force/brute_force.cpp index d6eb042e8..0f6a97043 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.cpp +++ b/src/VecSim/algorithms/brute_force/brute_force.cpp @@ -157,13 +157,7 @@ double BruteForceIndex::getDistanceFrom(size_t label, const void *vector_data) { } idType id = optionalId->second; VectorBlockMember *vector_index = this->idToVectorBlockMemberMapping[id]; - float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine - if (this->metric == VecSimMetric_Cosine) { - // TODO: need more generic - memcpy(normalized_blob, vector_data, this->dim * sizeof(float)); - float_vector_normalize(normalized_blob, this->dim); - vector_data = normalized_blob; - } + return this->dist_func(vector_index->block->getVector(vector_index->index), vector_data, &this->dim); } diff --git a/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp b/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp index 53f545de2..de2e48f0d 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp +++ b/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp @@ -83,13 +83,6 @@ VecSimResolveCode HNSWIndex::resolveParams(VecSimRawParam *rparams, int paramNum } double HNSWIndex::getDistanceFrom(size_t label, const void *vector_data) { - if (this->metric == VecSimMetric_Cosine) { - // TODO: need more generic - float normalized_data[this->dim]; - memcpy(normalized_data, vector_data, this->dim * sizeof(float)); - float_vector_normalize(normalized_data, this->dim); - return this->hnsw->getDistanceByLabelFromPoint(label, normalized_data); - } return this->hnsw->getDistanceByLabelFromPoint(label, vector_data); } diff --git a/src/VecSim/vec_sim.cpp b/src/VecSim/vec_sim.cpp index 656dc15df..f7fc7bf9e 100644 --- a/src/VecSim/vec_sim.cpp +++ b/src/VecSim/vec_sim.cpp @@ -46,6 +46,12 @@ extern "C" double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, con return index->getDistanceFrom(id, blob); } +extern "C" void VecSim_Normalize(void *blob, size_t dim, VecSimType type) { + // TODO: need more generic + assert(type == VecSimType_FLOAT32); + float_vector_normalize((float *)blob, dim); +} + extern "C" size_t VecSimIndex_IndexSize(VecSimIndex *index) { return index->indexSize(); } extern "C" VecSimResolveCode VecSimIndex_ResolveParams(VecSimIndex *index, VecSimRawParam *rparams, diff --git a/src/VecSim/vec_sim.h b/src/VecSim/vec_sim.h index 3fd5dc75f..791af90be 100644 --- a/src/VecSim/vec_sim.h +++ b/src/VecSim/vec_sim.h @@ -44,17 +44,28 @@ int VecSimIndex_AddVector(VecSimIndex *index, const void *blob, size_t id); int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id); /** - * @brief Calculate the distance of a vector from an index to a vector. + * @brief Calculate the distance of a vector from an index to a vector. This function assumes that + * the vector fits the index - its type and dimension are the same as the index's, and if the + * index's distance metric is cosine, the vector is already normalized. * @param index the index from which the first vector is located, and that defines the distance * metric. * @param id the id of the vector in the index. * @param blob binary representation of the second vector. Blob size should match the index data - * type and dimension. + * type and dimension, and pre-normalized if needed. * @return The distance (according to the index's distance metric) between `blob` and the vector * with id `id`. */ double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, const void *blob); +/** + * @brief normalize the vector blob in place. + * @param blob binary representation of a vector. Blob size should match the specified type and + * dimension. + * @param dim vector dimension. + * @param type vector type. + */ +void VecSim_Normalize(void *blob, size_t dim, VecSimType type); + /** * @brief Return the number of vectors in the index. * @param index the index whose size is requested. diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h index 54818ab1f..ad11fd42f 100644 --- a/src/VecSim/vec_sim_index.h +++ b/src/VecSim/vec_sim_index.h @@ -47,7 +47,7 @@ struct VecSimIndex : public VecsimBaseObject { * metric. * @param id the id of the vector in the index. * @param blob binary representation of the second vector. Blob size should match the index data - * type and dimension. + * type and dimension, and pre-normalized if needed. * @return The distance (according to the index's distance metric) between `blob` and the vector * with id `id`. */ diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 3461b4473..df285518c 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -971,6 +971,10 @@ TEST_F(BruteForceTest, brute_get_distance) { } void *query = v1; + void *norm = v2; // {e, e} + VecSim_Normalize(norm, dim, VecSimType_FLOAT32); // now {1/sqrt(2), 1/sqrt(2)} + ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f / sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f / sqrt(2.0f)); double dist; // VecSimMetric_L2 @@ -990,12 +994,12 @@ TEST_F(BruteForceTest, brute_get_distance) { // VecSimMetric_Cosine distances = {5.9604644775390625e-08, 5.9604644775390625e-08, 0.0025991201400756836, 1}; for (size_t i = 0; i < n; i++) { - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, norm); ASSERT_DOUBLE_EQ(dist, distances[i]); } // Bad values - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, norm); ASSERT_TRUE(std::isnan(dist)); dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_L2], 46, query); ASSERT_TRUE(std::isnan(dist)); diff --git a/tests/unit/test_hnswlib.cpp b/tests/unit/test_hnswlib.cpp index 8a97b2455..82bc6f498 100644 --- a/tests/unit/test_hnswlib.cpp +++ b/tests/unit/test_hnswlib.cpp @@ -1140,6 +1140,10 @@ TEST_F(HNSWLibTest, hnsw_get_distance) { } void *query = v1; + void *norm = v2; // {e, e} + VecSim_Normalize(norm, dim, VecSimType_FLOAT32); // now {1/sqrt(2), 1/sqrt(2)} + ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f / sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f / sqrt(2.0f)); double dist; // VecSimMetric_L2 @@ -1159,12 +1163,12 @@ TEST_F(HNSWLibTest, hnsw_get_distance) { // VecSimMetric_Cosine distances = {5.9604644775390625e-08, 5.9604644775390625e-08, 0.0025991201400756836, 1}; for (size_t i = 0; i < n; i++) { - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, norm); ASSERT_DOUBLE_EQ(dist, distances[i]); } // Bad values - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, norm); ASSERT_TRUE(std::isnan(dist)); dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_L2], 46, query); ASSERT_TRUE(std::isnan(dist));