@@ -92,7 +92,18 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
9292 rowC = (v4sf_t *) &CO[1* ldc+J]; \
9393 rowC[0] += result[1] * alpha;
9494#endif
95-
/*
 * KERNEL(i): one accumulation step of the unrolled inner loop at index i.
 *
 * Points rowA at &AO[i * 8], loads the vector pairs rowB and rowB1 from
 * &BO[i * 8] and &BO[i * 8 + 4], then issues eight
 * __builtin_mma_xvf64gerpp calls that accumulate into acc0..acc7:
 *   - even-numbered accumulators (acc0/2/4/6) use rowB,
 *   - odd-numbered accumulators (acc1/3/5/7) use rowB1,
 *   - rowA[0]..rowA[3] are each paired with both rowB and rowB1.
 *
 * Requirements at the expansion site: AO, BO, rowA, rowB, rowB1 and
 * acc0..acc7 must be in scope. rowA, rowB and rowB1 are overwritten.
 *
 * The argument is parenthesized so that any expression indexes correctly
 * regardless of operator precedence, and the body is wrapped in
 * do { } while (0) so that "KERNEL(x);" is exactly one statement (safe
 * under an unbraced if/else). The argument is evaluated three times, so
 * do not pass an expression with side effects.
 */
#define KERNEL(i) \
  do { \
    rowA = (vec_t *) &AO[(i) << 3]; \
    rowB = *((__vector_pair *) ((void *) &BO[(i) << 3])); \
    rowB1 = *((__vector_pair *) ((void *) &BO[((i) << 3) + 4])); \
    __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); \
    __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); \
    __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); \
    __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); \
    __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); \
    __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); \
    __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); \
    __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); \
  } while (0)
/*
 * PREFETCH1(x, y): emits a single PowerPC "dcbt" (data cache block touch)
 * instruction with x as its first register operand and y as its second.
 *
 * x is passed with the "r" constraint and y with the "b" constraint; there
 * are no outputs. The "memory" clobber tells the compiler that the
 * statement may touch memory, and "volatile" keeps it from being removed.
 * The expansion already ends in ';'.
 *
 * NOTE(review): in "dcbt RA,RB" a first operand of r0 is read as the value
 * zero rather than the register contents, and "r" does not exclude r0 while
 * "b" does -- confirm the constraint order is what was intended.
 */
96107#define PREFETCH1 (x , y ) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
97108
98109#if (defined(LEFT ) && !defined(TRANSA )) || (!defined(LEFT ) && defined(TRANSA ))
@@ -188,7 +199,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
188199 v4sf_t * rowC ;
189200 v4sf_t result [4 ];
190201 __vector_quad acc0 , acc1 , acc2 , acc3 , acc4 ,acc5 ,acc6 ,acc7 ;
191- BLASLONG l = 0 ;
202+ BLASLONG l = 1 ;
192203 vec_t * rowA = (vec_t * ) & AO [0 ];
193204 __vector_pair rowB , rowB1 ;
194205 rowB = * ((__vector_pair * )((void * )& BO [0 ]));
@@ -201,20 +212,55 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
201212 __builtin_mma_xvf64ger (& acc5 , rowB1 , rowA [2 ]);
202213 __builtin_mma_xvf64ger (& acc6 , rowB , rowA [3 ]);
203214 __builtin_mma_xvf64ger (& acc7 , rowB1 , rowA [3 ]);
204- for (l = 1 ; l < temp ; l ++ )
205- {
206- rowA = (vec_t * ) & AO [l << 3 ];
207- rowB = * ((__vector_pair * )((void * )& BO [l << 3 ]));
208- rowB1 = * ((__vector_pair * )((void * )& BO [(l << 3 ) + 4 ]));
209- __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
210- __builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
211- __builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [1 ]);
212- __builtin_mma_xvf64gerpp (& acc3 , rowB1 , rowA [1 ]);
213- __builtin_mma_xvf64gerpp (& acc4 , rowB , rowA [2 ]);
214- __builtin_mma_xvf64gerpp (& acc5 , rowB1 , rowA [2 ]);
215- __builtin_mma_xvf64gerpp (& acc6 , rowB , rowA [3 ]);
216- __builtin_mma_xvf64gerpp (& acc7 , rowB1 , rowA [3 ]);
217- }
215+ for (l = 1 ; l + 15 < temp ; l += 16 )
216+ {
217+ KERNEL (l );
218+ KERNEL (l + 1 );
219+ KERNEL (l + 2 );
220+ KERNEL (l + 3 );
221+ KERNEL (l + 4 );
222+ KERNEL (l + 5 );
223+ KERNEL (l + 6 );
224+ KERNEL (l + 7 );
225+ KERNEL (l + 8 );
226+ KERNEL (l + 9 );
227+ KERNEL (l + 10 );
228+ KERNEL (l + 11 );
229+ KERNEL (l + 12 );
230+ KERNEL (l + 13 );
231+ KERNEL (l + 14 );
232+ KERNEL (l + 15 );
233+ }
234+ if ((temp - l ) & 8 )
235+ {
236+ KERNEL (l );
237+ KERNEL (l + 1 );
238+ KERNEL (l + 2 );
239+ KERNEL (l + 3 );
240+ KERNEL (l + 4 );
241+ KERNEL (l + 5 );
242+ KERNEL (l + 6 );
243+ KERNEL (l + 7 );
244+ l += 8 ;
245+ }
246+ if ((temp - l ) & 4 )
247+ {
248+ KERNEL (l );
249+ KERNEL (l + 1 );
250+ KERNEL (l + 2 );
251+ KERNEL (l + 3 );
252+ l += 4 ;
253+ }
254+ if ((temp - l ) & 2 )
255+ {
256+ KERNEL (l );
257+ KERNEL (l + 1 );
258+ l += 2 ;
259+ }
260+ if ((temp - l ) & 1 )
261+ {
262+ KERNEL (l );
263+ }
218264 SAVE_ACC (& acc0 , 0 );
219265 SAVE_ACC1 (& acc1 , 0 );
220266 SAVE_ACC (& acc2 , 2 );
0 commit comments