-
Notifications
You must be signed in to change notification settings - Fork 22
[0.7] Beng arm opt f64 - [MOD-9077] #647
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
c58818e
cherry pick for arm opt fp64
BenGoldberger 8a523f6
small fix
BenGoldberger 030e4fd
remove unwanted changes
BenGoldberger 6872e17
add benchmarks for arm
BenGoldberger ca44c11
format
BenGoldberger 572867c
remove json files
BenGoldberger 8453568
pr changes
BenGoldberger File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,69 @@ | ||
| /* | ||
| *Copyright Redis Ltd. 2021 - present | ||
| *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or | ||
| *the Server Side Public License v1 (SSPLv1). | ||
| */ | ||
|
|
||
| #include "VecSim/spaces/space_includes.h" | ||
| #include <arm_neon.h> | ||
|
|
||
| inline void InnerProductStep(double *&pVect1, double *&pVect2, float64x2_t &sum) { | ||
| float64x2_t v1 = vld1q_f64(pVect1); | ||
| float64x2_t v2 = vld1q_f64(pVect2); | ||
| sum = vmlaq_f64(sum, v1, v2); | ||
| pVect1 += 2; | ||
| pVect2 += 2; | ||
| } | ||
|
|
||
| template <unsigned char residual> // 0..7 | ||
| double FP64_InnerProductSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { | ||
| double *pVect1 = (double *)pVect1v; | ||
| double *pVect2 = (double *)pVect2v; | ||
|
|
||
| float64x2_t sum0 = vdupq_n_f64(0.0); | ||
| float64x2_t sum1 = vdupq_n_f64(0.0); | ||
| float64x2_t sum2 = vdupq_n_f64(0.0); | ||
| float64x2_t sum3 = vdupq_n_f64(0.0); | ||
|
|
||
| const size_t num_of_chunks = dimension / 8; | ||
|
|
||
| for (size_t i = 0; i < num_of_chunks; i++) { | ||
| InnerProductStep(pVect1, pVect2, sum0); | ||
| InnerProductStep(pVect1, pVect2, sum1); | ||
| InnerProductStep(pVect1, pVect2, sum2); | ||
| InnerProductStep(pVect1, pVect2, sum3); | ||
| } | ||
|
|
||
| // Handle remaining complete 2-float blocks within residual | ||
| constexpr size_t remaining_chunks = residual / 2; | ||
| // Unrolled loop for the 2-float blocks | ||
| if constexpr (remaining_chunks >= 1) { | ||
| InnerProductStep(pVect1, pVect2, sum0); | ||
| } | ||
| if constexpr (remaining_chunks >= 2) { | ||
| InnerProductStep(pVect1, pVect2, sum1); | ||
| } | ||
| if constexpr (remaining_chunks >= 3) { | ||
| InnerProductStep(pVect1, pVect2, sum2); | ||
| } | ||
|
|
||
| // Handle final residual elements (0-1 elements) | ||
| // This entire block is eliminated at compile time if final_residual is 0 | ||
| constexpr size_t final_residual = residual % 2; // Final 0-1 elements | ||
| if constexpr (final_residual == 1) { | ||
| float64x2_t v1 = vdupq_n_f64(0.0); | ||
| float64x2_t v2 = vdupq_n_f64(0.0); | ||
| v1 = vld1q_lane_f64(pVect1, v1, 0); | ||
| v2 = vld1q_lane_f64(pVect2, v2, 0); | ||
|
|
||
| sum3 = vmlaq_f64(sum3, v1, v2); | ||
| } | ||
|
|
||
| float64x2_t sum_combined = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3)); | ||
|
|
||
| // Horizontal sum of the 4 elements in the NEON register | ||
| float64x1_t summed = vadd_f64(vget_low_f64(sum_combined), vget_high_f64(sum_combined)); | ||
| double sum = vget_lane_f64(summed, 0); | ||
|
|
||
| return 1.0 - sum; | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| /* | ||
| *Copyright Redis Ltd. 2021 - present | ||
| *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or | ||
| *the Server Side Public License v1 (SSPLv1). | ||
| */ | ||
|
|
||
| #include "VecSim/spaces/space_includes.h" | ||
|
|
||
| #include <arm_sve.h> | ||
|
|
||
| inline void InnerProductStep(double *&pVect1, double *&pVect2, size_t &offset, svfloat64_t &sum, | ||
| const size_t chunk) { | ||
| // Load vectors | ||
| svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset); | ||
| svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset); | ||
|
|
||
| // Multiply-accumulate | ||
| sum = svmla_f64_x(svptrue_b64(), sum, v1, v2); | ||
|
|
||
| // Advance pointers | ||
| offset += chunk; | ||
| } | ||
|
|
||
| template <bool partial_chunk, unsigned char additional_steps> | ||
| double FP64_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { | ||
| double *pVect1 = (double *)pVect1v; | ||
| double *pVect2 = (double *)pVect2v; | ||
| const size_t chunk = svcntd(); | ||
| size_t offset = 0; | ||
|
|
||
| // Multiple accumulators to increase instruction-level parallelism | ||
| svfloat64_t sum0 = svdup_f64(0.0); | ||
| svfloat64_t sum1 = svdup_f64(0.0); | ||
| svfloat64_t sum2 = svdup_f64(0.0); | ||
| svfloat64_t sum3 = svdup_f64(0.0); | ||
|
|
||
| auto chunk_size = 4 * chunk; | ||
| size_t number_of_chunks = dimension / chunk_size; | ||
| for (size_t i = 0; i < number_of_chunks; i++) { | ||
| InnerProductStep(pVect1, pVect2, offset, sum0, chunk); | ||
| InnerProductStep(pVect1, pVect2, offset, sum1, chunk); | ||
| InnerProductStep(pVect1, pVect2, offset, sum2, chunk); | ||
| InnerProductStep(pVect1, pVect2, offset, sum3, chunk); | ||
| } | ||
|
|
||
| if constexpr (additional_steps >= 1) { | ||
| InnerProductStep(pVect1, pVect2, offset, sum0, chunk); | ||
| } | ||
| if constexpr (additional_steps >= 2) { | ||
| InnerProductStep(pVect1, pVect2, offset, sum1, chunk); | ||
| } | ||
| if constexpr (additional_steps >= 3) { | ||
| InnerProductStep(pVect1, pVect2, offset, sum2, chunk); | ||
| } | ||
|
|
||
| if constexpr (partial_chunk) { | ||
| svbool_t pg = | ||
| svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension)); | ||
| svfloat64_t v1 = svld1_f64(pg, pVect1 + offset); | ||
| svfloat64_t v2 = svld1_f64(pg, pVect2 + offset); | ||
| sum3 = svmla_f64_m(pg, sum3, v1, v2); | ||
| } | ||
|
|
||
| // Combine the partial sums | ||
| sum0 = svadd_f64_x(svptrue_b64(), sum0, sum1); | ||
| sum2 = svadd_f64_x(svptrue_b64(), sum2, sum3); | ||
|
|
||
| // Perform vector addition in parallel | ||
| svfloat64_t sum_all = svadd_f64_x(svptrue_b64(), sum0, sum2); | ||
| // Single horizontal reduction at the end | ||
| double result = svaddv_f64(svptrue_b64(), sum_all); | ||
| return 1.0 - result; | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,76 @@ | ||
| /* | ||
| *Copyright Redis Ltd. 2021 - present | ||
| *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or | ||
| *the Server Side Public License v1 (SSPLv1). | ||
| */ | ||
|
|
||
| #include "VecSim/spaces/space_includes.h" | ||
| #include <arm_neon.h> | ||
|
|
||
| inline void L2SquareStep(double *&pVect1, double *&pVect2, float64x2_t &sum) { | ||
| float64x2_t v1 = vld1q_f64(pVect1); | ||
| float64x2_t v2 = vld1q_f64(pVect2); | ||
|
|
||
| // Calculate difference between vectors | ||
| float64x2_t diff = vsubq_f64(v1, v2); | ||
|
|
||
| // Square and accumulate | ||
| sum = vmlaq_f64(sum, diff, diff); | ||
|
|
||
| pVect1 += 2; | ||
| pVect2 += 2; | ||
| } | ||
|
|
||
| template <unsigned char residual> // 0..7 | ||
| double FP64_L2SqrSIMD8_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { | ||
| double *pVect1 = (double *)pVect1v; | ||
| double *pVect2 = (double *)pVect2v; | ||
|
|
||
| float64x2_t sum0 = vdupq_n_f64(0.0); | ||
| float64x2_t sum1 = vdupq_n_f64(0.0); | ||
| float64x2_t sum2 = vdupq_n_f64(0.0); | ||
| float64x2_t sum3 = vdupq_n_f64(0.0); | ||
| // These are compile-time constants derived from the template parameter | ||
|
|
||
| // Calculate how many full 8-element blocks to process | ||
| const size_t num_of_chunks = dimension / 8; | ||
|
|
||
| for (size_t i = 0; i < num_of_chunks; i++) { | ||
| L2SquareStep(pVect1, pVect2, sum0); | ||
| L2SquareStep(pVect1, pVect2, sum1); | ||
| L2SquareStep(pVect1, pVect2, sum2); | ||
| L2SquareStep(pVect1, pVect2, sum3); | ||
| } | ||
|
|
||
| // Handle remaining complete 2-float blocks within residual | ||
| constexpr size_t remaining_chunks = residual / 2; | ||
| // Unrolled loop for the 2-float blocks | ||
| if constexpr (remaining_chunks >= 1) { | ||
| L2SquareStep(pVect1, pVect2, sum0); | ||
| } | ||
| if constexpr (remaining_chunks >= 2) { | ||
| L2SquareStep(pVect1, pVect2, sum1); | ||
| } | ||
| if constexpr (remaining_chunks >= 3) { | ||
| L2SquareStep(pVect1, pVect2, sum2); | ||
| } | ||
|
|
||
| // Handle final residual element | ||
| constexpr size_t final_residual = residual % 2; // Final element | ||
| if constexpr (final_residual > 0) { | ||
| float64x2_t v1 = vdupq_n_f64(0.0); | ||
| float64x2_t v2 = vdupq_n_f64(0.0); | ||
| v1 = vld1q_lane_f64(pVect1, v1, 0); | ||
| v2 = vld1q_lane_f64(pVect2, v2, 0); | ||
|
|
||
| // Calculate difference and square | ||
| float64x2_t diff = vsubq_f64(v1, v2); | ||
| sum3 = vmlaq_f64(sum3, diff, diff); | ||
| } | ||
|
|
||
| float64x2_t sum_combined = vaddq_f64(vaddq_f64(sum0, sum1), vaddq_f64(sum2, sum3)); | ||
|
|
||
| // Horizontal sum of the 4 elements in the NEON register | ||
| float64x1_t sum = vadd_f64(vget_low_f64(sum_combined), vget_high_f64(sum_combined)); | ||
| return vget_lane_f64(sum, 0); | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| /* | ||
| *Copyright Redis Ltd. 2021 - present | ||
| *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or | ||
| *the Server Side Public License v1 (SSPLv1). | ||
| */ | ||
|
|
||
| #include "VecSim/spaces/space_includes.h" | ||
| #include <arm_sve.h> | ||
|
|
||
| inline void L2SquareStep(double *&pVect1, double *&pVect2, size_t &offset, svfloat64_t &sum, | ||
| const size_t chunk) { | ||
| // Load vectors | ||
| svfloat64_t v1 = svld1_f64(svptrue_b64(), pVect1 + offset); | ||
| svfloat64_t v2 = svld1_f64(svptrue_b64(), pVect2 + offset); | ||
|
|
||
| // Calculate difference between vectors | ||
| svfloat64_t diff = svsub_f64_x(svptrue_b64(), v1, v2); | ||
|
|
||
| // Square the difference and accumulate: sum += diff * diff | ||
| sum = svmla_f64_x(svptrue_b64(), sum, diff, diff); | ||
|
|
||
| // Advance pointers by the vector length | ||
| offset += chunk; | ||
| } | ||
|
|
||
| template <bool partial_chunk, unsigned char additional_steps> | ||
| double FP64_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { | ||
| double *pVect1 = (double *)pVect1v; | ||
| double *pVect2 = (double *)pVect2v; | ||
| const size_t chunk = svcntd(); | ||
| size_t offset = 0; | ||
|
|
||
| // Multiple accumulators to increase instruction-level parallelism | ||
| svfloat64_t sum0 = svdup_f64(0.0); | ||
| svfloat64_t sum1 = svdup_f64(0.0); | ||
| svfloat64_t sum2 = svdup_f64(0.0); | ||
| svfloat64_t sum3 = svdup_f64(0.0); | ||
|
|
||
| // Process vectors in chunks, with unrolling for better pipelining | ||
| auto chunk_size = 4 * chunk; | ||
| size_t number_of_chunks = dimension / chunk_size; | ||
| for (size_t i = 0; i < number_of_chunks; ++i) { | ||
| // Process 4 chunks with separate accumulators | ||
| L2SquareStep(pVect1, pVect2, offset, sum0, chunk); | ||
| L2SquareStep(pVect1, pVect2, offset, sum1, chunk); | ||
| L2SquareStep(pVect1, pVect2, offset, sum2, chunk); | ||
| L2SquareStep(pVect1, pVect2, offset, sum3, chunk); | ||
| } | ||
|
|
||
| if constexpr (additional_steps >= 1) { | ||
| L2SquareStep(pVect1, pVect2, offset, sum0, chunk); | ||
| } | ||
| if constexpr (additional_steps >= 2) { | ||
| L2SquareStep(pVect1, pVect2, offset, sum1, chunk); | ||
| } | ||
| if constexpr (additional_steps >= 3) { | ||
| L2SquareStep(pVect1, pVect2, offset, sum2, chunk); | ||
| } | ||
|
|
||
| if constexpr (partial_chunk) { | ||
| svbool_t pg = | ||
| svwhilelt_b64(static_cast<uint64_t>(offset), static_cast<uint64_t>(dimension)); | ||
|
|
||
| // Load vectors with predication | ||
| svfloat64_t v1 = svld1_f64(pg, pVect1 + offset); | ||
| svfloat64_t v2 = svld1_f64(pg, pVect2 + offset); | ||
|
|
||
| // Calculate difference with predication (corrected) | ||
| svfloat64_t diff = svsub_f64_x(pg, v1, v2); | ||
|
|
||
| // Square the difference and accumulate with predication | ||
| sum3 = svmla_f64_m(pg, sum3, diff, diff); | ||
| } | ||
|
|
||
| // Combine the partial sums | ||
| sum0 = svadd_f64_x(svptrue_b64(), sum0, sum1); | ||
| sum2 = svadd_f64_x(svptrue_b64(), sum2, sum3); | ||
| svfloat64_t sum_all = svadd_f64_x(svptrue_b64(), sum0, sum2); | ||
| double result = svaddv_f64(svptrue_b64(), sum_all); | ||
| return result; | ||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.