From 99dbfdec4794022d545103da6b529640ad0bf551 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 13:47:32 +0200 Subject: [PATCH 1/6] exported normalization function --- src/VecSim/vec_sim.cpp | 6 ++++++ src/VecSim/vec_sim.h | 11 ++++++++++- src/VecSim/vec_sim_index.h | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/VecSim/vec_sim.cpp b/src/VecSim/vec_sim.cpp index 656dc15df..f7fc7bf9e 100644 --- a/src/VecSim/vec_sim.cpp +++ b/src/VecSim/vec_sim.cpp @@ -46,6 +46,12 @@ extern "C" double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, con return index->getDistanceFrom(id, blob); } +extern "C" void VecSim_Normalize(void *blob, size_t dim, VecSimType type) { + // TODO: need more generic + assert(type == VecSimType_FLOAT32); + float_vector_normalize((float *)blob, dim); +} + extern "C" size_t VecSimIndex_IndexSize(VecSimIndex *index) { return index->indexSize(); } extern "C" VecSimResolveCode VecSimIndex_ResolveParams(VecSimIndex *index, VecSimRawParam *rparams, diff --git a/src/VecSim/vec_sim.h b/src/VecSim/vec_sim.h index 3fd5dc75f..7947da1b2 100644 --- a/src/VecSim/vec_sim.h +++ b/src/VecSim/vec_sim.h @@ -49,12 +49,21 @@ int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id); * metric. * @param id the id of the vector in the index. * @param blob binary representation of the second vector. Blob size should match the index data - * type and dimension. + * type and dimension, and pre-normalized if needed. * @return The distance (according to the index's distance metric) between `blob` and the vector * with id `id`. */ double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, const void *blob); +/** + * @brief normalize the vector blob. + * @param blob binary representation of a vector. Blob size should match the specified type and + * dimension. + * @param dim vector dimension. + * @param type vector type. + */ +void VecSim_Normalize(void *blob, size_t dim, VecSimType type); + /** * @brief Return the number of vectors in the index. * @param index the index whose size is requested. diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h index 54818ab1f..ad11fd42f 100644 --- a/src/VecSim/vec_sim_index.h +++ b/src/VecSim/vec_sim_index.h @@ -47,7 +47,7 @@ struct VecSimIndex : public VecsimBaseObject { * metric. * @param id the id of the vector in the index. * @param blob binary representation of the second vector. Blob size should match the index data - * type and dimension. + * type and dimension, and pre-normalized if needed. * @return The distance (according to the index's distance metric) between `blob` and the vector * with id `id`. */ From 49a530ebb280613c7a789648a07bec2f785d0b51 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 13:48:25 +0200 Subject: [PATCH 2/6] removed normalization step from getDistanceFrom --- src/VecSim/algorithms/brute_force/brute_force.cpp | 8 +------- src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp | 7 ------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/src/VecSim/algorithms/brute_force/brute_force.cpp b/src/VecSim/algorithms/brute_force/brute_force.cpp index d6eb042e8..0f6a97043 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.cpp +++ b/src/VecSim/algorithms/brute_force/brute_force.cpp @@ -157,13 +157,7 @@ double BruteForceIndex::getDistanceFrom(size_t label, const void *vector_data) { } idType id = optionalId->second; VectorBlockMember *vector_index = this->idToVectorBlockMemberMapping[id]; - float normalized_blob[this->dim]; // This will be use only if metric == VecSimMetric_Cosine - if (this->metric == VecSimMetric_Cosine) { - // TODO: need more generic - memcpy(normalized_blob, vector_data, this->dim * sizeof(float)); - float_vector_normalize(normalized_blob, this->dim); - vector_data = normalized_blob; - } + return this->dist_func(vector_index->block->getVector(vector_index->index), vector_data, &this->dim); } diff --git a/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp b/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp index 53f545de2..de2e48f0d 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp +++ b/src/VecSim/algorithms/hnsw/hnsw_wrapper.cpp @@ -83,13 +83,6 @@ VecSimResolveCode HNSWIndex::resolveParams(VecSimRawParam *rparams, int paramNum } double HNSWIndex::getDistanceFrom(size_t label, const void *vector_data) { - if (this->metric == VecSimMetric_Cosine) { - // TODO: need more generic - float normalized_data[this->dim]; - memcpy(normalized_data, vector_data, this->dim * sizeof(float)); - float_vector_normalize(normalized_data, this->dim); - return this->hnsw->getDistanceByLabelFromPoint(label, normalized_data); - } return this->hnsw->getDistanceByLabelFromPoint(label, vector_data); } From 1c86a618b24690b78a11abac87a05379250c5b9d Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 13:48:33 +0200 Subject: [PATCH 3/6] updated tests --- tests/unit/test_bruteforce.cpp | 6 ++++-- tests/unit/test_hnswlib.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 3461b4473..4e67a3ca2 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -971,6 +971,8 @@ TEST_F(BruteForceTest, brute_get_distance) { } void *query = v1; + void *norm = v2; + VecSim_Normalize(norm, dim, VecSimType_FLOAT32); double dist; // VecSimMetric_L2 @@ -990,12 +992,12 @@ TEST_F(BruteForceTest, brute_get_distance) { // VecSimMetric_Cosine distances = {5.9604644775390625e-08, 5.9604644775390625e-08, 0.0025991201400756836, 1}; for (size_t i = 0; i < n; i++) { - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, norm); ASSERT_DOUBLE_EQ(dist, distances[i]); } // Bad values - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, norm); ASSERT_TRUE(std::isnan(dist)); dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_L2], 46, query); ASSERT_TRUE(std::isnan(dist)); diff --git a/tests/unit/test_hnswlib.cpp b/tests/unit/test_hnswlib.cpp index 8a97b2455..2f4635239 100644 --- a/tests/unit/test_hnswlib.cpp +++ b/tests/unit/test_hnswlib.cpp @@ -1140,6 +1140,8 @@ TEST_F(HNSWLibTest, hnsw_get_distance) { } void *query = v1; + void *norm = v2; + VecSim_Normalize(norm, dim, VecSimType_FLOAT32); double dist; // VecSimMetric_L2 @@ -1159,12 +1161,12 @@ TEST_F(HNSWLibTest, hnsw_get_distance) { // VecSimMetric_Cosine distances = {5.9604644775390625e-08, 5.9604644775390625e-08, 0.0025991201400756836, 1}; for (size_t i = 0; i < n; i++) { - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], i + 1, norm); ASSERT_DOUBLE_EQ(dist, distances[i]); } // Bad values - dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, query); + dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_Cosine], 0, norm); ASSERT_TRUE(std::isnan(dist)); dist = VecSimIndex_GetDistanceFrom(index[VecSimMetric_L2], 46, query); ASSERT_TRUE(std::isnan(dist)); From ca6aa6438d1cc24aa5dc6c20ae5d248381bf370c Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 14:15:15 +0200 Subject: [PATCH 4/6] review fixes --- src/VecSim/vec_sim.h | 2 +- tests/unit/test_bruteforce.cpp | 6 ++++-- tests/unit/test_hnswlib.cpp | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/VecSim/vec_sim.h b/src/VecSim/vec_sim.h index 7947da1b2..29d06b740 100644 --- a/src/VecSim/vec_sim.h +++ b/src/VecSim/vec_sim.h @@ -56,7 +56,7 @@ int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id); double VecSimIndex_GetDistanceFrom(VecSimIndex *index, size_t id, const void *blob); /** - * @brief normalize the vector blob. + * @brief normalize the vector blob in place. * @param blob binary representation of a vector. Blob size should match the specified type and * dimension. * @param dim vector dimension. diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 4e67a3ca2..05fe591e0 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -971,8 +971,10 @@ TEST_F(BruteForceTest, brute_get_distance) { } void *query = v1; - void *norm = v2; - VecSim_Normalize(norm, dim, VecSimType_FLOAT32); + void *norm = v2; // {e, e} + VecSim_Normalize(norm, dim, VecSimType_FLOAT32); // now {1/sqrt(2), 1/sqrt(2)} + ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f/sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f/sqrt(2.0f)); double dist; // VecSimMetric_L2 diff --git a/tests/unit/test_hnswlib.cpp b/tests/unit/test_hnswlib.cpp index 2f4635239..97186797d 100644 --- a/tests/unit/test_hnswlib.cpp +++ b/tests/unit/test_hnswlib.cpp @@ -1140,8 +1140,10 @@ TEST_F(HNSWLibTest, hnsw_get_distance) { } void *query = v1; - void *norm = v2; - VecSim_Normalize(norm, dim, VecSimType_FLOAT32); + void *norm = v2; // {e, e} + VecSim_Normalize(norm, dim, VecSimType_FLOAT32); // now {1/sqrt(2), 1/sqrt(2)} + ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f/sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f/sqrt(2.0f)); double dist; // VecSimMetric_L2 From c1b4aac69b76c96141955a1abb7d2a691de47481 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 14:44:08 +0200 Subject: [PATCH 5/6] format --- tests/unit/test_bruteforce.cpp | 4 ++-- tests/unit/test_hnswlib.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_bruteforce.cpp b/tests/unit/test_bruteforce.cpp index 05fe591e0..df285518c 100644 --- a/tests/unit/test_bruteforce.cpp +++ b/tests/unit/test_bruteforce.cpp @@ -973,8 +973,8 @@ TEST_F(BruteForceTest, brute_get_distance) { void *query = v1; void *norm = v2; // {e, e} VecSim_Normalize(norm, dim, VecSimType_FLOAT32); // now {1/sqrt(2), 1/sqrt(2)} - ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f/sqrt(2.0f)); - ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f/sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f / sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f / sqrt(2.0f)); double dist; // VecSimMetric_L2 diff --git a/tests/unit/test_hnswlib.cpp b/tests/unit/test_hnswlib.cpp index 97186797d..82bc6f498 100644 --- a/tests/unit/test_hnswlib.cpp +++ b/tests/unit/test_hnswlib.cpp @@ -1142,8 +1142,8 @@ TEST_F(HNSWLibTest, hnsw_get_distance) { void *query = v1; void *norm = v2; // {e, e} VecSim_Normalize(norm, dim, VecSimType_FLOAT32); // now {1/sqrt(2), 1/sqrt(2)} - ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f/sqrt(2.0f)); - ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f/sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[0], 1.0f / sqrt(2.0f)); + ASSERT_FLOAT_EQ(((float *)norm)[1], 1.0f / sqrt(2.0f)); double dist; // VecSimMetric_L2 From f69bb0f0931744b5bf15bc9e80f3f5f1114e4955 Mon Sep 17 00:00:00 2001 From: GuyAv46 Date: Thu, 10 Mar 2022 15:26:51 +0200 Subject: [PATCH 6/6] elaborated docs --- src/VecSim/vec_sim.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/VecSim/vec_sim.h b/src/VecSim/vec_sim.h index 29d06b740..791af90be 100644 --- a/src/VecSim/vec_sim.h +++ b/src/VecSim/vec_sim.h @@ -44,7 +44,9 @@ int VecSimIndex_AddVector(VecSimIndex *index, const void *blob, size_t id); int VecSimIndex_DeleteVector(VecSimIndex *index, size_t id); /** - * @brief Calculate the distance of a vector from an index to a vector. + * @brief Calculate the distance of a vector from an index to a vector. This function assumes that + * the vector fits the index - its type and dimension are the same as the index's, and if the + * index's distance metric is cosine, the vector is already normalized. * @param index the index from which the first vector is located, and that defines the distance * metric. * @param id the id of the vector in the index.