From 53d7452cdfa7bd7ffa74afda51a482b0dca990dd Mon Sep 17 00:00:00 2001 From: yuanjia Date: Fri, 12 Sep 2025 09:45:06 +0800 Subject: [PATCH 1/2] riscv: gemv_t_vector.c optimize --- kernel/riscv64/gemv_t_vector.c | 127 +++++---------------------------- 1 file changed, 19 insertions(+), 108 deletions(-) diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index 0c9dad1e4b..d6562cb70f 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -72,123 +72,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T_M1 v_res; size_t vlmax = VSETVL_MAX_M1(); -#ifndef RISCV_0p10_INTRINSICS - FLOAT_V_T va0, va1, va2, va3, vr0, vr1, vr2, vr3; - FLOAT_V_T_M1 vec0, vec1, vec2, vec3; - FLOAT *a_ptrs[4], *y_ptrs[4]; -#endif - if(inc_x == 1){ -#ifndef RISCV_0p10_INTRINSICS - BLASLONG anr = n - n % 4; - for (; i < anr; i += 4) { - gvl = VSETVL(m); - j = 0; - for (int l = 0; l < 4; l++) { - a_ptrs[l] = a + (i + l) * lda; - y_ptrs[l] = y + (i + l) * inc_y; - } - vec0 = VFMVVF_FLOAT_M1(0.0, vlmax); - vec1 = VFMVVF_FLOAT_M1(0.0, vlmax); - vec2 = VFMVVF_FLOAT_M1(0.0, vlmax); - vec3 = VFMVVF_FLOAT_M1(0.0, vlmax); - vr0 = VFMVVF_FLOAT(0.0, gvl); - vr1 = VFMVVF_FLOAT(0.0, gvl); - vr2 = VFMVVF_FLOAT(0.0, gvl); - vr3 = VFMVVF_FLOAT(0.0, gvl); - for (k = 0; k < m / gvl; k++) { - va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl); - va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl); - va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl); - va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl); - vx = VLEV_FLOAT(x + j, gvl); - vr0 = VFMULVV_FLOAT(va0, vx, gvl); - vr1 = VFMULVV_FLOAT(va1, vx, gvl); - vr2 = VFMULVV_FLOAT(va2, vx, gvl); - vr3 = VFMULVV_FLOAT(va3, vx, gvl); - // Floating-point addition does not satisfy the associative law, that is, (a + b) + c ≠ a + (b + c), - // so piecewise multiplication and reduction must be performed inside the loop body. - vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl); - vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl); - vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl); - vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl); - j += gvl; - } - if (j < m) { - gvl = VSETVL(m - j); - va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl); - va1 = VLEV_FLOAT(a_ptrs[1] + j, gvl); - va2 = VLEV_FLOAT(a_ptrs[2] + j, gvl); - va3 = VLEV_FLOAT(a_ptrs[3] + j, gvl); - - vx = VLEV_FLOAT(x + j, gvl); - vr0 = VFMULVV_FLOAT(va0, vx, gvl); - vr1 = VFMULVV_FLOAT(va1, vx, gvl); - vr2 = VFMULVV_FLOAT(va2, vx, gvl); - vr3 = VFMULVV_FLOAT(va3, vx, gvl); - vec0 = VFREDSUM_FLOAT(vr0, vec0, gvl); - vec1 = VFREDSUM_FLOAT(vr1, vec1, gvl); - vec2 = VFREDSUM_FLOAT(vr2, vec2, gvl); - vec3 = VFREDSUM_FLOAT(vr3, vec3, gvl); - } - *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(vec0)); - *y_ptrs[1] += alpha * (FLOAT)(EXTRACT_FLOAT(vec1)); - *y_ptrs[2] += alpha * (FLOAT)(EXTRACT_FLOAT(vec2)); - *y_ptrs[3] += alpha * (FLOAT)(EXTRACT_FLOAT(vec3)); - } - // deal with the tail - for (; i < n; i++) { - v_res = VFMVVF_FLOAT_M1(0, vlmax); + for(i = 0; i < n; i++){ + v_res = VFMVVF_FLOAT_M1(0, 1); gvl = VSETVL(m); j = 0; - a_ptrs[0] = a + i * lda; - y_ptrs[0] = y + i * inc_y; - vr0 = VFMVVF_FLOAT(0, gvl); - for (k = 0; k < m / gvl; k++) { - va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl); - vx = VLEV_FLOAT(x + j, gvl); - vr0 = VFMULVV_FLOAT(va0, vx, gvl); - v_res = VFREDSUM_FLOAT(vr0, v_res, gvl); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < m/gvl; k++){ + va = VLEV_FLOAT(&a_ptr[j], gvl); + vx = VLEV_FLOAT(&x[j], gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail j += gvl; } - if (j < m) { - gvl = VSETVL(m - j); - va0 = VLEV_FLOAT(a_ptrs[0] + j, gvl); - vx = VLEV_FLOAT(x + j, gvl); - vr0 = VFMULVV_FLOAT(va0, vx, gvl); - v_res = VFREDSUM_FLOAT(vr0, v_res, gvl); + if(j < m){ + gvl = VSETVL(m-j); + va = VLEV_FLOAT(&a_ptr[j], gvl); + vx = VLEV_FLOAT(&x[j], gvl); + vr = VFMULVV_FLOAT(va, vx, gvl); + v_res = VFREDSUM_FLOAT(vr, v_res, gvl); } - *y_ptrs[0] += alpha * (FLOAT)(EXTRACT_FLOAT(v_res)); - } -#else - for(i = 0; i < n; i++){ - v_res = VFMVVF_FLOAT_M1(0, 1); - gvl = VSETVL(m); - j = 0; - vr = VFMVVF_FLOAT(0, gvl); - for(k = 0; k < m/gvl; k++){ - va = VLEV_FLOAT(&a_ptr[j], gvl); - vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMULVV_FLOAT(va, vx, gvl); // could vfmacc here and reduce outside loop - v_res = VFREDSUM_FLOAT(vr, v_res, gvl); // but that reordering diverges far enough from scalar path to make tests fail - j += gvl; - } - if(j < m){ - gvl = VSETVL(m-j); - va = VLEV_FLOAT(&a_ptr[j], gvl); - vx = VLEV_FLOAT(&x[j], gvl); - vr = VFMULVV_FLOAT(va, vx, gvl); - v_res = VFREDSUM_FLOAT(vr, v_res, gvl); - } - temp = (FLOAT)EXTRACT_FLOAT(v_res); - y[iy] += alpha * temp; + temp = (FLOAT)EXTRACT_FLOAT(v_res); + y[iy] += alpha * temp; - iy += inc_y; - a_ptr += lda; - } -#endif + iy += inc_y; + a_ptr += lda; + } } else { BLASLONG stride_x = inc_x * sizeof(FLOAT); for(i = 0; i < n; i++){ From 826cb4588fe4dbc306e3be2a9ac7e0e5fdd6e332 Mon Sep 17 00:00:00 2001 From: yuanjia Date: Sat, 13 Sep 2025 11:35:49 +0800 Subject: [PATCH 2/2] remove unused variable --- kernel/riscv64/gemv_t_vector.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c index d6562cb70f..1bdd57b6d1 100644 --- a/kernel/riscv64/gemv_t_vector.c +++ b/kernel/riscv64/gemv_t_vector.c @@ -28,7 +28,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) #define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n) -#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e32m1) #define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T_M1 vfloat32m1_t #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8) @@ -44,7 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define xint_t int #else #define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n) -#define VSETVL_MAX_M1 RISCV_RVV(vsetvlmax_e64m1) #define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T_M1 vfloat64m1_t #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8) @@ -70,7 +68,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va, vr, vx; unsigned int gvl = 0; FLOAT_V_T_M1 v_res; - size_t vlmax = VSETVL_MAX_M1(); if(inc_x == 1){