diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
new file mode 100644
index 000000000..f14c2289f
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_neon.h>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
+ *
+ * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
+ */
+
+// Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
+static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
+                                                     float32x4_t &sum0, float32x4_t &sum1,
+                                                     float32x4_t &sum2, float32x4_t &sum3) {
+    uint8x16_t v1_u8 = vld1q_u8(pVect1);
+    // SQ8 values 0..255 are exact in FP16, so widen uint8 -> uint16 -> fp16 -> fp32.
+    // This drops two integer-widening ops per chunk versus the uint8 -> u16 -> u32 -> f32
+    // chain while producing bit-identical FP32 lane values.
+    float16x8_t v1_h_lo = vcvtq_f16_u16(vmovl_u8(vget_low_u8(v1_u8)));
+    float16x8_t v1_h_hi = vcvtq_f16_u16(vmovl_u8(vget_high_u8(v1_u8)));
+    float32x4_t v1_0 = vcvt_f32_f16(vget_low_f16(v1_h_lo));
+    float32x4_t v1_1 = vcvt_f32_f16(vget_high_f16(v1_h_lo));
+    float32x4_t v1_2 = vcvt_f32_f16(vget_low_f16(v1_h_hi));
+    float32x4_t v1_3 = vcvt_f32_f16(vget_high_f16(v1_h_hi));
+
+    const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
+    float16x8_t q_lo = vld1q_f16(q);
+    float16x8_t q_hi = vld1q_f16(q + 8);
+    float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
+    float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
+    float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
+    float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));
+
+    sum0 = vfmaq_f32(sum0, v1_0, v2_0);
+    sum1 = vfmaq_f32(sum1, v1_1, v2_1);
+    sum2 = vfmaq_f32(sum2, v1_2, v2_2);
+    sum3 = vfmaq_f32(sum3, v1_3, v2_3);
+
+    pVect1 += 16;
+    pVect2 += 16;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
+                                              size_t dimension) {
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
+    }
+
+    // Residual: up to three independent 4-lane sub-steps, leaving at most 3 elements
+    // for scalar — mirrors the SQ8_FP32 NEON sister pattern.
+    // vld1_f16 (4 FP16 = 8 bytes) is safe for any residual: FP16 metadata follows
+    // the lane data so there is always enough headroom.
+    constexpr unsigned char r = residual;
+    if constexpr (r >= 4) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
+        float32x4_t v2_a = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        sum0 = vfmaq_f32(sum0, v1_a, v2_a);
+        pVect1 += 4;
+        pVect2 += 4;
+    }
+    if constexpr (r >= 8) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
+        float32x4_t v2_b = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        sum1 = vfmaq_f32(sum1, v1_b, v2_b);
+        pVect1 += 4;
+        pVect2 += 4;
+    }
+    if constexpr (r >= 12) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
+        float32x4_t v2_c = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        sum2 = vfmaq_f32(sum2, v1_c, v2_c);
+        pVect1 += 4;
+        pVect2 += 4;
+    }
+    constexpr unsigned char tail = r & 3;
+    float scalar_dot = 0.0f;
+    for (unsigned char k = 0; k < tail; ++k) {
+        scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
+    }
+
+    float32x4_t sum_lo = vaddq_f32(sum0, sum1);
+    float32x4_t sum_hi = vaddq_f32(sum2, sum3);
+    float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual>
+float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual>
+float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h
new file mode 100644
index 000000000..a36627e80
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_sve.h>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * SVE2 asymmetric SQ8 (storage) <-> FP16 (query) inner product using the identity:
+ *   IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
+ *
+ * SVE2-only fast path: the storage bytes (0..255, exact in FP16) and the FP16 query
+ * lanes stay 16-bit, and the FP16->FP32 widening multiply-accumulate is done by the
+ * FMLALB/FMLALT pair (svmlalb_f32 / svmlalt_f32). Each pair widens the even/odd
+ * half-precision lanes to single precision and multiplies/accumulates in FP32 WITHOUT
+ * intermediate rounding, so the per-lane products match the SVE svmla path exactly while
+ * processing svcnth() lanes per step (twice the base-SVE svcntw() granularity) and halving
+ * the number of loads and explicit conversions. The even/odd accumulator split groups the
+ * FP32 additions differently than the base SVE kernel, so the reduced result is numerically
+ * equivalent (well within the test tolerance) rather than bit-identical.
+ */
+
+// Helper: one svcnth()-wide FP16 step feeding an even/odd FP32 accumulator pair.
+static inline void SQ8_FP16_InnerProductStep_SVE2(const uint8_t *pVect1, const float16 *pVect2,
+                                                  size_t &offset, svfloat32_t &sum_even,
+                                                  svfloat32_t &sum_odd, svbool_t pg16,
+                                                  size_t chunk) {
+    svuint16_t v1_u16 = svld1ub_u16(pg16, pVect1 + offset);
+    svfloat16_t v1_f16 = svcvt_f16_u16_x(pg16, v1_u16);
+    svfloat16_t q_f16 = svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2 + offset));
+    // FMLALB/FMLALT are unpredicated; inactive lanes were zeroed by the loads above so
+    // their contribution is 0 and walking all lanes is safe.
+    sum_even = svmlalb_f32(sum_even, v1_f16, q_f16);
+    sum_odd = svmlalt_f32(sum_odd, v1_f16, q_f16);
+    offset += chunk;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE2_IMP(const void *pVect1v, const void *pVect2v,
+                                         size_t dimension) {
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    size_t offset = 0;
+    const svbool_t pg16 = svptrue_b16();
+    const size_t chunk = svcnth();
+
+    svfloat32_t sum0e = svdup_f32(0.0f), sum0o = svdup_f32(0.0f);
+    svfloat32_t sum1e = svdup_f32(0.0f), sum1o = svdup_f32(0.0f);
+    svfloat32_t sum2e = svdup_f32(0.0f), sum2o = svdup_f32(0.0f);
+    svfloat32_t sum3e = svdup_f32(0.0f), sum3o = svdup_f32(0.0f);
+
+    // Partial chunk for dim % chunk FP16 lanes. Zeroing loads (_z convert) leave inactive
+    // lanes at 0 so the unpredicated FMLALB/FMLALT below ignore them.
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            svbool_t pg_partial = svwhilelt_b16(uint64_t(0), uint64_t(remaining));
+            svuint16_t v1_u16 = svld1ub_u16(pg_partial, pVect1 + offset);
+            svfloat16_t v1_f16 = svcvt_f16_u16_z(pg_partial, v1_u16);
+            svfloat16_t q_f16 =
+                svld1_f16(pg_partial, reinterpret_cast<const float16_t *>(pVect2 + offset));
+            sum0e = svmlalb_f32(sum0e, v1_f16, q_f16);
+            sum0o = svmlalt_f32(sum0o, v1_f16, q_f16);
+            offset += remaining;
+        }
+    }
+
+    // Main loop: 4 steps per iteration, one even/odd accumulator pair per step.
+    const size_t chunk_size = 4 * chunk;
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum0e, sum0o, pg16, chunk);
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum1e, sum1o, pg16, chunk);
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum2e, sum2o, pg16, chunk);
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum3e, sum3o, pg16, chunk);
+    }
+
+    if constexpr (additional_steps > 0)
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum0e, sum0o, pg16, chunk);
+    if constexpr (additional_steps > 1)
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum1e, sum1o, pg16, chunk);
+    if constexpr (additional_steps > 2)
+        SQ8_FP16_InnerProductStep_SVE2(pVect1, pVect2, offset, sum2e, sum2o, pg16, chunk);
+
+    const svbool_t pg32 = svptrue_b32();
+    svfloat32_t sum = svadd_f32_z(pg32, sum0e, sum0o);
+    sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum1e, sum1o));
+    sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum2e, sum2o));
+    sum = svadd_f32_x(pg32, sum, svadd_f32_x(pg32, sum3e, sum3o));
+    float quantized_dot = svaddv_f32(pg32, sum);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductSIMD_SVE2_IMP<partial_chunk, additional_steps>(
+                      pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_CosineSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD_SVE2<partial_chunk, additional_steps>(pVect1v, pVect2v,
+                                                                           dimension);
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
new file mode 100644
index 000000000..1408e0880
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_sve.h>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
+ *
+ * FP16 query lanes are widened to FP32 per step via svld1uh_u32 + svcvt_f32_f16_x.
+ * svld1uh_u32 zero-extends each FP16 halfword into a 32-bit lane so that
+ * svcvt_f32_f16_x reads the correct bits directly without any interleaving.
+ */
+
+// Helper: one SVE-vector-width-of-FP32 step.
+static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2,
+                                                 size_t &offset, svfloat32_t &sum, svbool_t pg,
+                                                 size_t chunk) {
+    svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
+    svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
+    svuint32_t q_u32 = svld1uh_u32(pg, reinterpret_cast<const uint16_t *>(pVect2 + offset));
+    svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32));
+    sum = svmla_f32_x(pg, sum, v1_f, v2_f);
+    offset += chunk;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
+                                        size_t dimension) {
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    size_t offset = 0;
+    svbool_t pg = svptrue_b32();
+    const size_t chunk = svcntw();
+
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero;
+    // the final reduction walks all lanes via svptrue_b32().
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining));
+            svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
+            svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
+            svuint32_t q_u32 =
+                svld1uh_u32(pg_partial, reinterpret_cast<const uint16_t *>(pVect2 + offset));
+            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32));
+            sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
+            offset += remaining;
+        }
+    }
+
+    // Main loop: 4 chunks per iteration, one chunk per accumulator.
+    const size_t chunk_size = 4 * chunk;
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
+    }
+
+    if constexpr (additional_steps > 0)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
+    if constexpr (additional_steps > 1)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
+    if constexpr (additional_steps > 2)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
+
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
+    float quantized_dot = svaddv_f32(pg, sum);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
+                      pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(pVect1v, pVect2v,
+                                                                          dimension);
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index b57971b60..9366d3144 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -225,6 +225,26 @@ dist_func_t<float> IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif
 #endif // OPT_F16C
 #endif // x86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     return ret_dist_func;
 }
 
@@ -274,6 +294,26 @@ dist_func_t<float> Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm
 #endif
 #endif // OPT_F16C
 #endif // x86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_Cosine_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     return ret_dist_func;
 }
 
diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
new file mode 100644
index 000000000..70367d7fe
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
+
+/*
+ * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity:
+ *
+ *   ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2)
+ *               = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ *
+ * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32.
+ */
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h
new file mode 100644
index 000000000..d9451fe2a
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h"
+
+/*
+ * SVE2 SQ8<->FP16 L2 squared distance:
+ *   ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ * IP is computed by SQ8_FP16_InnerProductSIMD_SVE2_IMP; metadata is FP32.
+ */
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_L2SqrSIMD_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductSIMD_SVE2_IMP<partial_chunk, additional_steps>(
+        pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
new file mode 100644
index 000000000..f70ef493d
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
+
+/*
+ * SVE SQ8<->FP16 L2 squared distance:
+ *   ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32.
+ */
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
+        pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 43020399f..7d65814e0 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -156,6 +156,26 @@ dist_func_t<float> L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif
 #endif // OPT_F16C
 #endif // x86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_L2_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     return ret_dist_func;
 }
 
diff --git a/src/VecSim/spaces/functions/NEON_HP.cpp b/src/VecSim/spaces/functions/NEON_HP.cpp
index 2dea94934..20d93a517 100644
--- a/src/VecSim/spaces/functions/NEON_HP.cpp
+++ b/src/VecSim/spaces/functions/NEON_HP.cpp
@@ -10,6 +10,8 @@
 
 #include "VecSim/spaces/L2/L2_NEON_FP16.h"
 #include "VecSim/spaces/IP/IP_NEON_FP16.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h"
 
 namespace spaces {
 
@@ -27,6 +29,24 @@ dist_func_t<float> Choose_FP16_IP_implementation_NEON_HP(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON_HP.h b/src/VecSim/spaces/functions/NEON_HP.h
index c65bd6948..889eb0919 100644
--- a/src/VecSim/spaces/functions/NEON_HP.h
+++ b/src/VecSim/spaces/functions/NEON_HP.h
@@ -16,4 +16,8 @@ dist_func_t<float> Choose_FP16_IP_implementation_NEON_HP(size_t dim);
 
 dist_func_t<float> Choose_FP16_L2_implementation_NEON_HP(size_t dim);
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim);
+
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp
index fde853db2..bd197c84c 100644
--- a/src/VecSim/spaces/functions/SVE.cpp
+++ b/src/VecSim/spaces/functions/SVE.cpp
@@ -25,6 +25,9 @@
 #include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h"
 #include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h"
 
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h"
+
 #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h"
 #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h"
 
@@ -119,6 +122,24 @@ dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum)
 // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) {
diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h
index bd3bc97c3..43b3b22cd 100644
--- a/src/VecSim/spaces/functions/SVE.h
+++ b/src/VecSim/spaces/functions/SVE.h
@@ -33,6 +33,10 @@ dist_func_t<float> Choose_SQ8_FP32_IP_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_Cosine_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE(size_t dim);
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim);
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim);
diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp
index 4215d79cf..9eea81523 100644
--- a/src/VecSim/spaces/functions/SVE2.cpp
+++ b/src/VecSim/spaces/functions/SVE2.cpp
@@ -16,14 +16,16 @@
 
 #include "VecSim/spaces/IP/IP_SVE_FP64.h"
 #include "VecSim/spaces/L2/L2_SVE_FP64.h"
-#include "VecSim/spaces/L2/L2_SVE_INT8.h"     // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/IP/IP_SVE_INT8.h"     // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/L2/L2_SVE_UINT8.h"    // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/IP/IP_SVE_UINT8.h"    // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h"  // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h"  // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_INT8.h"      // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_INT8.h"      // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_UINT8.h"     // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_UINT8.h"     // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h"  // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h"  // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE2_SQ8_FP16.h" // SVE2 fast path: FMLALB/FMLALT widening
+#include "VecSim/spaces/L2/L2_SVE2_SQ8_FP16.h" // SVE2 fast path: FMLALB/FMLALT widening
+#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h"   // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h"   // SVE2 implementation is identical to SVE
 
 namespace spaces {
 
@@ -116,6 +118,24 @@ dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE2, dim, svcnth);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE2, dim, svcnth);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE2, dim, svcnth);
+    return ret_dist_func;
+}
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized)
 // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) {
diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h
index 04078a91e..2c1bfbac3 100644
--- a/src/VecSim/spaces/functions/SVE2.h
+++ b/src/VecSim/spaces/functions/SVE2.h
@@ -33,6 +33,10 @@ dist_func_t<float> Choose_SQ8_FP32_IP_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_Cosine_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim);
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim);
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim);
diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh
index 91ba49448..115a4cac9 100755
--- a/tests/benchmark/benchmarks.sh
+++ b/tests/benchmark/benchmarks.sh
@@ -21,6 +21,7 @@ if [ -z "$BM_TYPE"  ] || [ "$BM_TYPE" = "benchmarks-all" ]; then
     echo spaces_int8
     echo spaces_uint8
     echo spaces_sq8_fp32
+    echo spaces_sq8_fp16
     echo spaces_sq8_sq8
 
 elif [ "$BM_TYPE" = "benchmarks-default" ]; then
@@ -33,6 +34,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then
     echo spaces_int8
     echo spaces_uint8
     echo spaces_sq8_fp32
+    echo spaces_sq8_fp16
     echo spaces_sq8_sq8
 
 
@@ -106,6 +108,7 @@ elif [ "$BM_TYPE" = "bm-basics-svs-fp32-single" ] ; then
     echo basics_svs_single_fp32_LVQ8
 elif [ "$BM_TYPE" = "bm-spaces-sq8-full" ] ; then
     echo spaces_sq8_fp32
+    echo spaces_sq8_fp16
     echo spaces_sq8_sq8
 
 
@@ -118,6 +121,7 @@ elif [ "$BM_TYPE" = "bm-spaces" ] ; then
     echo spaces_int8
     echo spaces_uint8
     echo spaces_sq8_fp32
+    echo spaces_sq8_fp16
     echo spaces_sq8_sq8
 
 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then
@@ -134,6 +138,8 @@ elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then
     echo spaces_uint8
 elif [ "$BM_TYPE" = "bm-spaces-sq8-fp32" ] ; then
     echo spaces_sq8_fp32
+elif [ "$BM_TYPE" = "bm-spaces-sq8-fp16" ] ; then
+    echo spaces_sq8_fp16
 elif [ "$BM_TYPE" = "bm-spaces-sq8-sq8" ] ; then
     echo spaces_sq8_sq8
 fi
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
index ba3030064..cc5d040cb 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
@@ -16,8 +16,8 @@ using float16 = vecsim_types::float16;
 /**
  * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query.
  * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA /
- * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels
- * land via MOD-14972.
+ * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP
+ * / SVE / SVE2) are registered below.
  */
 class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture {
 protected:
@@ -85,6 +85,29 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, s
 #endif // OPT_F16C
 #endif // x86_64
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features;
+
+#ifdef OPT_SVE2
+bool sve2_supported = arm_opt.sve2;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+#endif
+
+#ifdef OPT_SVE
+bool sve_supported = arm_opt.sve;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+#endif
+
+#ifdef OPT_NEON_HP
+bool neon_hp_supported = arm_opt.asimdhp;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16,
+                                 neon_hp_supported);
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
 // Naive (scalar) baseline — always registered as the comparison anchor.
 
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, InnerProduct, 16);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 474ac5c75..ce8605565 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -3149,6 +3149,45 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
     unsigned char alignment = 0;
     arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, SQ8_FP16_L2Sqr)
@@ -3224,6 +3263,45 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
     unsigned char alignment = 0;
     arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, SQ8_FP16_InnerProduct)
@@ -3299,6 +3377,45 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
     unsigned char alignment = 0;
     arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, SQ8_FP16_Cosine)