Skip to content

Commit 19a5a3e

Browse files
sirus20x6 and Aaron authored
ggml : Leverage the existing GGML_F32_VEC helpers to vectorize ggml_vec_set_f32 for faster fills (ggml-org#16522)
* Leverage the existing GGML_F32_VEC helpers to broadcast the fill value across SIMD registers and store in vector-sized chunks, while retaining the scalar tail for leftover elements and non-SIMD builds.
* Vectorize additional f32 helper loops
* Normalize f32 helper tails for ggml vec ops

---------

Co-authored-by: Aaron <shelhamer.aaron@gmail.com>
1 parent d8eaa26 commit 19a5a3e

File tree

1 file changed

+91
-5
lines changed

1 file changed

+91
-5
lines changed

ggml/src/ggml-cpu/vec.h

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -77,16 +77,85 @@ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp
7777
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
7878
}
7979
}
80-
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
81-
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
82-
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
80+
// z[k] = x[k] + v for every k in [0, n).
// With GGML_SIMD defined, a leading run of elements is processed GGML_F32_STEP
// at a time through the GGML_F32_VEC helpers; the scalar loop at the end picks
// up the remaining elements and is the only path on non-SIMD builds.
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    int pos = 0;
#if defined(GGML_SIMD)
    // mask off the low bits of n: a multiple of GGML_F32_STEP when the step is a power of two
    const int vec_end = (n & ~(GGML_F32_STEP - 1));

    // the scalar addend replicated into a vector register
    GGML_F32_VEC addend = GGML_F32_VEC_SET1(v);

    while (pos < vec_end) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = pos + r*GGML_F32_EPR;

            GGML_F32_VEC src = GGML_F32_VEC_LOAD(x + off);
            GGML_F32_VEC sum = GGML_F32_VEC_ADD(src, addend);
            GGML_F32_VEC_STORE(z + off, sum);
        }
        pos += GGML_F32_STEP;
    }
#endif
    // scalar tail
    while (pos < n) {
        z[pos] = x[pos] + v;
        ++pos;
    }
}
99+
// y[k] += x[k] for every k in [0, n).
// With GGML_SIMD defined, a leading run of elements is accumulated
// GGML_F32_STEP at a time through the GGML_F32_VEC helpers; the scalar loop at
// the end handles the remaining elements and is the only path on non-SIMD builds.
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
    int pos = 0;
#if defined(GGML_SIMD)
    // mask off the low bits of n: a multiple of GGML_F32_STEP when the step is a power of two
    const int vec_end = (n & ~(GGML_F32_STEP - 1));

    while (pos < vec_end) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = pos + r*GGML_F32_EPR;

            // accumulator is loaded first, then the increment
            GGML_F32_VEC acc = GGML_F32_VEC_LOAD(y + off);
            GGML_F32_VEC inc = GGML_F32_VEC_LOAD(x + off);
            acc = GGML_F32_VEC_ADD(acc, inc);
            GGML_F32_VEC_STORE(y + off, acc);
        }
        pos += GGML_F32_STEP;
    }
#endif
    // scalar tail
    while (pos < n) {
        y[pos] += x[pos];
        ++pos;
    }
}
117+
// y[k] += v for every k in [0, n).
// With GGML_SIMD defined, a leading run of elements is updated GGML_F32_STEP
// at a time through the GGML_F32_VEC helpers; the scalar loop at the end
// handles the remaining elements and is the only path on non-SIMD builds.
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
    int pos = 0;
#if defined(GGML_SIMD)
    // mask off the low bits of n: a multiple of GGML_F32_STEP when the step is a power of two
    const int vec_end = (n & ~(GGML_F32_STEP - 1));

    // the scalar increment replicated into a vector register
    GGML_F32_VEC inc = GGML_F32_VEC_SET1(v);

    while (pos < vec_end) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = pos + r*GGML_F32_EPR;

            GGML_F32_VEC acc = GGML_F32_VEC_LOAD(y + off);
            acc = GGML_F32_VEC_ADD(acc, inc);
            GGML_F32_VEC_STORE(y + off, acc);
        }
        pos += GGML_F32_STEP;
    }
#endif
    // scalar tail
    while (pos < n) {
        y[pos] += v;
        ++pos;
    }
}
83136
// z[k] = x[k] - y[k] for every k in [0, n); plain scalar loop.
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) {
    int k = 0;
    while (k < n) {
        z[k] = x[k] - y[k];
        ++k;
    }
}
84137
// z[k] = x[k] - y[k] for every k in [0, n), on ggml_fp16_t data.
// Each operand goes through GGML_CPU_FP16_TO_FP32, the difference is taken on
// the converted values, and the result goes back through GGML_CPU_FP32_TO_FP16.
inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    int k = 0;
    while (k < n) {
        z[k] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[k]) - GGML_CPU_FP16_TO_FP32(y[k]));
        ++k;
    }
}
89-
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
142+
// x[k] = v for every k in [0, n).
// With GGML_SIMD defined, v is broadcast into a vector register once and
// stored over a leading run of elements GGML_F32_STEP at a time; the scalar
// loop at the end fills the remaining elements and is the only path on
// non-SIMD builds.
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) {
    int pos = 0;
#if defined(GGML_SIMD)
    // mask off the low bits of n: a multiple of GGML_F32_STEP when the step is a power of two
    const int vec_end = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC fill = GGML_F32_VEC_SET1(v);

    while (pos < vec_end) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            GGML_F32_VEC_STORE(x + pos + r*GGML_F32_EPR, fill);
        }
        pos += GGML_F32_STEP;
    }
#endif
    // scalar tail
    while (pos < n) {
        x[pos] = v;
        ++pos;
    }
}
90159
// y[k] = x[k] for every k in [0, n); element-by-element copy in ascending index order.
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = x[k];
        ++k;
    }
}
91160
// y[k] = -x[k] for every k in [0, n); plain scalar loop.
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = -x[k];
        ++k;
    }
}
92161
inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
@@ -95,7 +164,24 @@ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp
95164
}
96165
}
97166

98-
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
167+
// z[k] = x[k]*y[k] for every k in [0, n).
// With GGML_SIMD defined, a leading run of elements is multiplied
// GGML_F32_STEP at a time through the GGML_F32_VEC helpers; the scalar loop at
// the end handles the remaining elements and is the only path on non-SIMD builds.
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) {
    int pos = 0;
#if defined(GGML_SIMD)
    // mask off the low bits of n: a multiple of GGML_F32_STEP when the step is a power of two
    const int vec_end = (n & ~(GGML_F32_STEP - 1));

    while (pos < vec_end) {
        for (int r = 0; r < GGML_F32_ARR; ++r) {
            const int off = pos + r*GGML_F32_EPR;

            GGML_F32_VEC lhs  = GGML_F32_VEC_LOAD(x + off);
            GGML_F32_VEC rhs  = GGML_F32_VEC_LOAD(y + off);
            GGML_F32_VEC prod = GGML_F32_VEC_MUL(lhs, rhs);
            GGML_F32_VEC_STORE(z + off, prod);
        }
        pos += GGML_F32_STEP;
    }
#endif
    // scalar tail
    while (pos < n) {
        z[pos] = x[pos]*y[pos];
        ++pos;
    }
}
99185
inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
100186
for (int i = 0; i < n; ++i) {
101187
z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));

0 commit comments

Comments
 (0)