Skip to content

Commit 0c59ae0

Browse files
authored
Merge pull request #5453 from pratiklp00/dgemm_optimization
Dgemm loop unroll and 4x1, 4x2 dgemv VSX implementation for power10.
2 parents 585e6d0 + 6637352 commit 0c59ae0

File tree

2 files changed

+107
-16
lines changed

2 files changed

+107
-16
lines changed

kernel/power/dgemm_kernel_power10.c

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,18 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
9292
rowC = (v4sf_t *) &CO[1* ldc+J]; \
9393
rowC[0] += result[1] * alpha;
9494
#endif
95-
95+
/*
 * KERNEL(i): one accumulation step of the unrolled inner loop.
 *
 * Points rowA at AO[i * 8], loads two __vector_pair operands from BO[i * 8]
 * and BO[i * 8 + 4], then issues eight __builtin_mma_xvf64gerpp calls that
 * accumulate into acc0..acc7 (rowB/rowB1 paired with rowA[0..3]).
 *
 * Relies on AO, BO, rowA, rowB, rowB1 and acc0..acc7 being in scope at the
 * expansion site.  The argument is parenthesized so expressions such as
 * `l + 1` are safe, and the body is wrapped in do { } while (0) so that
 * `KERNEL (x);` is a single statement.
 */
#define KERNEL(i) \
  do { \
    rowA = (vec_t *) &AO[(i) << 3]; \
    rowB = *((__vector_pair *) ((void *) &BO[(i) << 3])); \
    rowB1 = *((__vector_pair *) ((void *) &BO[((i) << 3) + 4])); \
    __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); \
    __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); \
    __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); \
    __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); \
    __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); \
    __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); \
    __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); \
    __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); \
  } while (0)
96107
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
97108

98109
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -188,7 +199,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
188199
v4sf_t *rowC;
189200
v4sf_t result[4];
190201
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
191-
BLASLONG l = 0;
202+
BLASLONG l = 1;
192203
vec_t *rowA = (vec_t *) & AO[0];
193204
__vector_pair rowB, rowB1;
194205
rowB = *((__vector_pair *)((void *)&BO[0]));
@@ -201,20 +212,55 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
201212
__builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]);
202213
__builtin_mma_xvf64ger (&acc6, rowB, rowA[3]);
203214
__builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]);
204-
for (l = 1; l < temp; l++)
205-
{
206-
rowA = (vec_t *) & AO[l << 3];
207-
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
208-
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
209-
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
210-
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
211-
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
212-
__builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]);
213-
__builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]);
214-
__builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]);
215-
__builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]);
216-
__builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]);
217-
}
215+
for (l = 1; l + 15 < temp; l += 16)
216+
{
217+
KERNEL (l);
218+
KERNEL (l+1);
219+
KERNEL (l+2);
220+
KERNEL (l+3);
221+
KERNEL (l+4);
222+
KERNEL (l+5);
223+
KERNEL (l+6);
224+
KERNEL (l+7);
225+
KERNEL (l+8);
226+
KERNEL (l+9);
227+
KERNEL (l+10);
228+
KERNEL (l+11);
229+
KERNEL (l+12);
230+
KERNEL (l+13);
231+
KERNEL (l+14);
232+
KERNEL (l+15);
233+
}
234+
if ((temp - l) & 8)
235+
{
236+
KERNEL(l);
237+
KERNEL(l+1);
238+
KERNEL(l+2);
239+
KERNEL(l+3);
240+
KERNEL(l+4);
241+
KERNEL(l+5);
242+
KERNEL(l+6);
243+
KERNEL(l+7);
244+
l += 8;
245+
}
246+
if ((temp - l) & 4)
247+
{
248+
KERNEL(l);
249+
KERNEL(l+1);
250+
KERNEL(l+2);
251+
KERNEL(l+3);
252+
l += 4;
253+
}
254+
if ((temp - l) & 2)
255+
{
256+
KERNEL(l);
257+
KERNEL(l+1);
258+
l += 2;
259+
}
260+
if ((temp - l) & 1)
261+
{
262+
KERNEL(l);
263+
}
218264
SAVE_ACC (&acc0, 0);
219265
SAVE_ACC1 (&acc1, 0);
220266
SAVE_ACC (&acc2, 2);

kernel/power/dgemv_n_microk_power10.c

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,53 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

28+
/*
 * Flags announcing the kernels this file defines below: dgemv_kernel_4x2,
 * dgemv_kernel_4x1 and dgemv_kernel_4x4.
 * NOTE(review): presumably tested by the including dgemv driver to skip its
 * generic fallbacks - confirm against the driver source.
 */
#define HAVE_KERNEL_4x2 1
29+
#define HAVE_KERNEL_4x1 1
2830
#define HAVE_KERNEL_4x4 1
2931

32+
/*
 * Two-column dgemv update:
 *
 *   y[r] += a0[r] * (xo[0] * alpha) + a1[r] * (xo[1] * alpha)
 *
 * Rows are processed four at a time (two 2-wide double vectors per
 * iteration).
 *
 *   n      row count; only complete groups of four rows are updated, so
 *          callers are expected to pass a multiple of 4
 *   a0,a1  the two matrix columns
 *   xo     the two x elements paired with a0 and a1
 *   y      output vector, updated in place
 *   alpha  scalar applied to both x elements before the multiply
 *
 * The loop bound is `i + 1 < nvec` because each iteration touches vectors
 * i and i + 1; bounding on `i < nvec` alone would read and write one vector
 * past the end whenever n % 4 >= 2.  The index is BLASLONG to match n.
 *
 * NOTE(review): y, a0 and a1 are dereferenced as __vector double; this
 * assumes the caller's buffers are suitably aligned or that the target
 * tolerates unaligned vector accesses - confirm.
 */
static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
  FLOAT x0 = xo[0] * alpha;
  FLOAT x1 = xo[1] * alpha;
  __vector double v_x0 = {x0, x0};
  __vector double v_x1 = {x1, x1};
  __vector double *v_y = (__vector double *) y;
  __vector double *va0 = (__vector double *) a0;
  __vector double *va1 = (__vector double *) a1;
  BLASLONG nvec = n / 2;	/* number of 2-element vectors in n rows */

  for (BLASLONG i = 0; i + 1 < nvec; i += 2)
    {
      v_y[i] += va0[i] * v_x0 + va1[i] * v_x1;
      v_y[i + 1] += va0[i + 1] * v_x0 + va1[i + 1] * v_x1;
    }
}
52+
53+
/*
 * Single-column dgemv update:
 *
 *   y[r] += a0[r] * (xo[0] * alpha)
 *
 * Rows are processed four at a time (two 2-wide double vectors per
 * iteration).
 *
 *   n      row count; only complete groups of four rows are updated, so
 *          callers are expected to pass a multiple of 4
 *   a0     the matrix column
 *   xo     points at the x element paired with a0
 *   y      output vector, updated in place
 *   alpha  scalar applied to the x element before the multiply
 *
 * The loop bound is `i + 1 < nvec` because each iteration touches vectors
 * i and i + 1; bounding on `i < nvec` alone would read and write one vector
 * past the end whenever n % 4 >= 2.  The index is BLASLONG to match n.
 *
 * NOTE(review): y and a0 are dereferenced as __vector double; this assumes
 * the caller's buffers are suitably aligned or that the target tolerates
 * unaligned vector accesses - confirm.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
  FLOAT x0 = xo[0] * alpha;
  __vector double v_x0 = {x0, x0};
  __vector double *v_y = (__vector double *) y;
  __vector double *va0 = (__vector double *) a0;
  BLASLONG nvec = n / 2;	/* number of 2-element vectors in n rows */

  for (BLASLONG i = 0; i + 1 < nvec; i += 2)
    {
      v_y[i] += va0[i] * v_x0;
      v_y[i + 1] += va0[i + 1] * v_x0;
    }
}
73+
74+
3075
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
3176
{
3277
double *a0;

0 commit comments

Comments
 (0)