-
Notifications
You must be signed in to change notification settings - Fork 3
/
winograd.c
1260 lines (1128 loc) · 59 KB
/
winograd.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <string.h>
#include <assert.h>
#include <immintrin.h>
#include <mkl.h>
#include <sys/time.h>
extern long ISTRIDE;
extern long FSTRIDE;
extern long OSTRIDE;
/*
 * Input transform for sixteen 6x6 tiles at once (one tile per SIMD lane).
 *
 * Reads rows x .. x+5 and columns y .. y+65 of dataSrc (row pitch `nrows`
 * floats).  Lane k of every vector belongs to the tile whose top-left corner
 * is (x, y + 4*k), i.e. horizontally adjacent tiles are 4 columns apart and
 * overlap by 2 columns.
 *
 * For each tile d the function computes  Bt * d * B  with
 *
 *        |  4   0  -5   0   1   0 |
 *        |  0  -4  -4   1   1   0 |
 *   Bt = |  0   4  -4  -1   1   0 |
 *        |  0  -2  -1   2   1   0 |
 *        |  0   2  -1  -2   1   0 |
 *        |  0   4   0  -5   0   1 |
 *
 * (this is the B^T matrix of the Winograd F(4x4, 3x3) input transform).
 *
 * Element (i, j) of the result is written as 16 consecutive floats (one per
 * tile) at dataDst + (6*i + j) * ISTRIDE + *counter.  On return *counter has
 * been advanced by 16.
 *
 * The caller must guarantee that all 66 columns of the six rows are readable
 * and that the 36 destination slots are writable; nothing is checked here.
 */
static void get_tiles_4x3_16t(long x, long y, long nrows, const float *dataSrc,
                              float *dataDst, long *counter)
{
    const long base = *counter;
    long r, c;

    /*
     * Selectors for _mm512_permutex2var_ps over the 32-float concatenation
     * of its two vector operands: `even` picks elements 0,2,..,30 and `odd`
     * picks 1,3,..,31.  Applying a selector twice turns 64 consecutive
     * floats into a stride-4 gather.
     */
    const __m512i even = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16,
                                          14, 12, 10, 8, 6, 4, 2, 0);
    const __m512i odd = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17,
                                         15, 13, 11, 9, 7, 5, 3, 1);

    const __m512 m0 = _mm512_setzero_ps();
    const __m512 m2 = _mm512_set1_ps(2.0f);
    const __m512 m4 = _mm512_set1_ps(4.0f);
    const __m512 m5 = _mm512_set1_ps(5.0f);

    __m512 d[36];   /* d[6*r + c]: lane k = dataSrc[(x+r)*nrows + y + 4*k + c] */
    __m512 t[36];   /* t = Bt * d, same row-major 6x6 layout                   */
    __m512 acc;

    /* ---- Gather: de-interleave each source row into six column vectors ---- */
    for (r = 0; r < 6; ++r) {
        const float *row = dataSrc + (x + r) * nrows + y;
        __m512 *dr = d + 6 * r;
        __m512 q0, q1, q2, q3;
        __m512 ev_lo, od_lo, ev_hi, od_hi;

        /* Columns 0..3 come from the 64 floats starting at the row origin. */
        q0 = _mm512_loadu_ps(row);
        q1 = _mm512_loadu_ps(row + 16);
        q2 = _mm512_loadu_ps(row + 32);
        q3 = _mm512_loadu_ps(row + 48);
        ev_lo = _mm512_permutex2var_ps(q0, even, q1);
        od_lo = _mm512_permutex2var_ps(q0, odd, q1);
        ev_hi = _mm512_permutex2var_ps(q2, even, q3);
        od_hi = _mm512_permutex2var_ps(q2, odd, q3);
        dr[0] = _mm512_permutex2var_ps(ev_lo, even, ev_hi);
        dr[1] = _mm512_permutex2var_ps(od_lo, even, od_hi);
        dr[2] = _mm512_permutex2var_ps(ev_lo, odd, ev_hi);
        dr[3] = _mm512_permutex2var_ps(od_lo, odd, od_hi);

        /*
         * Columns 4 and 5 reach past the first 64 floats for the last tile,
         * so reload the row shifted by two floats and take the "+2"/"+3"
         * phases of that window (offsets 4 and 5 relative to the origin).
         */
        q0 = _mm512_loadu_ps(row + 2);
        q1 = _mm512_loadu_ps(row + 18);
        q2 = _mm512_loadu_ps(row + 34);
        q3 = _mm512_loadu_ps(row + 50);
        ev_lo = _mm512_permutex2var_ps(q0, even, q1);
        od_lo = _mm512_permutex2var_ps(q0, odd, q1);
        ev_hi = _mm512_permutex2var_ps(q2, even, q3);
        od_hi = _mm512_permutex2var_ps(q2, odd, q3);
        dr[4] = _mm512_permutex2var_ps(ev_lo, odd, ev_hi);
        dr[5] = _mm512_permutex2var_ps(od_lo, odd, od_hi);
    }

    /* ---- First pass: t = Bt * d, one tile column at a time ---- */
    for (c = 0; c < 6; ++c) {
        const __m512 d0 = d[c];
        const __m512 d1 = d[6 + c];
        const __m512 d2 = d[12 + c];
        const __m512 d3 = d[18 + c];
        const __m512 d4 = d[24 + c];
        const __m512 d5 = d[30 + c];

        /* 4*d0 - 5*d2 + d4 */
        acc = _mm512_mul_ps(m4, d0);
        acc = _mm512_fnmadd_ps(m5, d2, acc);
        t[c] = _mm512_add_ps(d4, acc);

        /* -4*d1 - 4*d2 + d3 + d4 */
        acc = _mm512_fnmadd_ps(m4, d1, m0);
        acc = _mm512_fnmadd_ps(m4, d2, acc);
        acc = _mm512_add_ps(d3, acc);
        t[6 + c] = _mm512_add_ps(d4, acc);

        /* 4*d1 - 4*d2 - d3 + d4 */
        acc = _mm512_mul_ps(m4, d1);
        acc = _mm512_fnmadd_ps(m4, d2, acc);
        acc = _mm512_sub_ps(acc, d3);
        t[12 + c] = _mm512_add_ps(acc, d4);

        /* -2*d1 - d2 + 2*d3 + d4 */
        acc = _mm512_fnmadd_ps(m2, d1, m0);
        acc = _mm512_sub_ps(acc, d2);
        acc = _mm512_fmadd_ps(m2, d3, acc);
        t[18 + c] = _mm512_add_ps(acc, d4);

        /* 2*d1 - d2 - 2*d3 + d4 */
        acc = _mm512_mul_ps(m2, d1);
        acc = _mm512_sub_ps(acc, d2);
        acc = _mm512_fnmadd_ps(m2, d3, acc);
        t[24 + c] = _mm512_add_ps(acc, d4);

        /* 4*d1 - 5*d3 + d5 */
        acc = _mm512_mul_ps(m4, d1);
        acc = _mm512_fnmadd_ps(m5, d3, acc);
        t[30 + c] = _mm512_add_ps(acc, d5);
    }

    /* ---- Second pass: (Bt * d) * B, one row at a time, stored directly ---- */
    for (r = 0; r < 6; ++r) {
        const __m512 *tr = t + 6 * r;

        /* 4*t0 - 5*t2 + t4 */
        acc = _mm512_mul_ps(tr[0], m4);
        acc = _mm512_fnmadd_ps(m5, tr[2], acc);
        acc = _mm512_add_ps(acc, tr[4]);
        _mm512_storeu_ps(dataDst + (6 * r + 0) * ISTRIDE + base, acc);

        /* -4*t1 - 4*t2 + t3 + t4 */
        acc = _mm512_fnmadd_ps(m4, tr[1], m0);
        acc = _mm512_fnmadd_ps(m4, tr[2], acc);
        acc = _mm512_add_ps(tr[3], acc);
        acc = _mm512_add_ps(tr[4], acc);
        _mm512_storeu_ps(dataDst + (6 * r + 1) * ISTRIDE + base, acc);

        /* 4*t1 - 4*t2 - t3 + t4 */
        acc = _mm512_mul_ps(m4, tr[1]);
        acc = _mm512_fnmadd_ps(m4, tr[2], acc);
        acc = _mm512_sub_ps(acc, tr[3]);
        acc = _mm512_add_ps(acc, tr[4]);
        _mm512_storeu_ps(dataDst + (6 * r + 2) * ISTRIDE + base, acc);

        /* -2*t1 - t2 + 2*t3 + t4 */
        acc = _mm512_fnmadd_ps(m2, tr[1], m0);
        acc = _mm512_sub_ps(acc, tr[2]);
        acc = _mm512_fmadd_ps(m2, tr[3], acc);
        acc = _mm512_add_ps(acc, tr[4]);
        _mm512_storeu_ps(dataDst + (6 * r + 3) * ISTRIDE + base, acc);

        /* 2*t1 - t2 - 2*t3 + t4 */
        acc = _mm512_mul_ps(m2, tr[1]);
        acc = _mm512_sub_ps(acc, tr[2]);
        acc = _mm512_fnmadd_ps(m2, tr[3], acc);
        acc = _mm512_add_ps(acc, tr[4]);
        _mm512_storeu_ps(dataDst + (6 * r + 4) * ISTRIDE + base, acc);

        /* 4*t1 - 5*t3 + t5 */
        acc = _mm512_mul_ps(m4, tr[1]);
        acc = _mm512_fnmadd_ps(m5, tr[3], acc);
        acc = _mm512_add_ps(acc, tr[5]);
        _mm512_storeu_ps(dataDst + (6 * r + 5) * ISTRIDE + base, acc);
    }

    *counter += 16;
}
/*
 * Edge-tile path: copy a partial lenX x lenY window of dataSrc (top-left at
 * (x, y), row pitch `nrows`) into the scratch buffer `temp`, zero-fill the
 * remainder of a 6-row x 66-column block, and run the 16-tile transform on
 * that padded block.
 *
 * Nothing is done (and *counter is left untouched) when either extent is
 * exactly 2.
 *
 * NOTE(review): `temp` must hold at least 6 * 66 floats, and the code relies
 * on lenX <= 6 and lenY <= 66 -- neither is checked here; confirm against the
 * callers.
 */
static inline void pad_get_tiles(long x, long y, long lenX, long lenY, long nrows, const float *dataSrc,
                                 float *temp, float *dataDst, long *counter) {
    enum { PAD_ROW_LEN = 66, PAD_ROWS = 6 };
    long row = 0;

    if (lenX == 2 || lenY == 2)
        return;

    /* Copy the valid part of every available row and clear its tail. */
    while (row < lenX) {
        float *line = temp + row * PAD_ROW_LEN;

        memcpy(line, dataSrc + (x + row) * nrows + y, sizeof(float) * lenY);
        memset(line + lenY, 0, sizeof(float) * (PAD_ROW_LEN - lenY));
        ++row;
    }

    /* Clear the rows below the valid window in one go. */
    memset(temp + row * PAD_ROW_LEN, 0, sizeof(float) * (PAD_ROWS - row) * PAD_ROW_LEN);

    /* The scratch block is its own image: origin (0, 0), pitch 66. */
    get_tiles_4x3_16t(0, 0, PAD_ROW_LEN, temp, dataDst, counter);
}
/*
 * Scalar input transform for a single 6x6 tile.
 *
 * Reads the 6x6 window of dataSrc whose top-left element is (x, y) (row
 * pitch `nrows`), computes  Bt * d * B  with
 *
 *        |  4   0  -5   0   1   0 |
 *        |  0  -4  -4   1   1   0 |
 *   Bt = |  0   4  -4  -1   1   0 |
 *        |  0  -2  -1   2   1   0 |
 *        |  0   2  -1  -2   1   0 |
 *        |  0   4   0  -5   0   1 |
 *
 * and writes element (i, j) of the result to
 * dataDst[(6*i + j) * ISTRIDE + *counter].  *counter is then incremented
 * by one.
 */
static inline void get_tiles_4x3_1t(long x, long y, long nrows, const float *dataSrc,
                                    float *dataDst, long *counter) {
    const long slot = *counter;
    long r, c;
    float d[36] __attribute__((aligned(64)));   /* source tile, row-major */
    float v[36] __attribute__((aligned(64)));   /* Bt * d, row-major      */

    /* Load the whole tile before anything is written to dataDst. */
    for (r = 0; r < 6; ++r) {
        const float *src = dataSrc + (x + r) * nrows + y;

        for (c = 0; c < 6; ++c)
            d[6 * r + c] = src[c];
    }

    /* First pass: combine the six rows of each column. */
    for (c = 0; c < 6; ++c) {
        const float d0 = d[c];
        const float d1 = d[6 + c];
        const float d2 = d[12 + c];
        const float d3 = d[18 + c];
        const float d4 = d[24 + c];
        const float d5 = d[30 + c];

        v[c]      = 4 * d0 - 5 * d2 + d4;
        v[6 + c]  = -4 * d1 - 4 * d2 + d3 + d4;
        v[12 + c] = 4 * d1 - 4 * d2 - d3 + d4;
        v[18 + c] = -2 * d1 - d2 + 2 * d3 + d4;
        v[24 + c] = 2 * d1 - d2 - 2 * d3 + d4;
        v[30 + c] = 4 * d1 - 5 * d3 + d5;
    }

    /* Second pass: combine the six columns of each row and scatter. */
    for (r = 0; r < 6; ++r) {
        const float *t = v + 6 * r;

        dataDst[(6 * r + 0) * ISTRIDE + slot] = t[0] * 4 - t[2] * 5 + t[4];
        dataDst[(6 * r + 1) * ISTRIDE + slot] = -t[1] * 4 - t[2] * 4 + t[3] + t[4];
        dataDst[(6 * r + 2) * ISTRIDE + slot] = t[1] * 4 - t[2] * 4 - t[3] + t[4];
        dataDst[(6 * r + 3) * ISTRIDE + slot] = -t[1] * 2 - t[2] + t[3] * 2 + t[4];
        dataDst[(6 * r + 4) * ISTRIDE + slot] = t[1] * 2 - t[2] - t[3] * 2 + t[4];
        dataDst[(6 * r + 5) * ISTRIDE + slot] = t[1] * 4 - t[3] * 5 + t[5];
    }

    (*counter)++;
}
/*
 * Transform every 3x3 filter tile into a 6x6 tile and scatter it into `out`.
 *
 * filter : read as K consecutive groups of C tiles, 9 floats per tile
 *          (tile for output index m and input index n starts at
 *          filter + (m * C + n) * 9).
 * C, K   : inner and outer loop extents.
 * out    : written as 36 planes spaced FSTRIDE floats apart; element i of the
 *          transformed tile lands at out[i * FSTRIDE + m * C + n].
 *
 * Each tile g is processed in two passes with the same 6x3 coefficient
 * matrix G (rows: 1/4 0 0 | -1/6 -1/6 -1/6 | -1/6 1/6 -1/6 |
 * 1/24 1/12 1/6 | 1/24 -1/12 1/6 | 0 0 1): first G * g (6x3), then
 * (G * g) * G^T (6x6).
 * NOTE(review): this looks like the usual Winograd F(4x4, 3x3) filter
 * transform -- confirm against the rest of the file.
 */
static void filter_transform_4x3(const float* __restrict__ filter, const long C, const long K, float* __restrict__ out) {
    long k, c, plane;
    const float *g;
    const float r4 = 1.0 / 4;
    const float r6 = 1.0 / 6;
    const float r12 = 1.0 / 12;
    const float r24 = 1.0 / 24;

#pragma omp parallel for collapse(2) private(k, c, plane, g)
#pragma simd
    for (k = 0; k < K; ++k) {
        for (c = 0; c < C; ++c) {
            float Gg[18] __attribute__((aligned(64)));   /* G * g, 6 rows x 3 cols   */
            float GgGt[36] __attribute__((aligned(64))); /* (G * g) * G^T, 6 x 6     */
            long col, row;

            g = filter + c * 3 * 3 + k * 3 * 3 * C;

            /* First pass: apply G down each of the three columns of g. */
            for (col = 0; col < 3; ++col) {
                const float top = g[col];
                const float mid = g[3 + col];
                const float bot = g[6 + col];

                Gg[col] = r4 * top;
                Gg[3 + col] = -r6 * (top + mid + bot);
                Gg[6 + col] = -r6 * (top - mid + bot);
                Gg[9 + col] = r24 * top + r12 * mid + r6 * bot;
                Gg[12 + col] = r24 * top - r12 * mid + r6 * bot;
                Gg[15 + col] = bot;
            }

            /* Second pass: apply G along each of the six rows of G * g. */
            for (row = 0; row < 6; ++row) {
                const float left = Gg[3 * row];
                const float centre = Gg[3 * row + 1];
                const float right = Gg[3 * row + 2];
                float *dst = GgGt + 6 * row;

                dst[0] = r4 * left;
                dst[1] = -r6 * (left + centre + right);
                dst[2] = -r6 * (left - centre + right);
                dst[3] = r24 * left + r12 * centre + r6 * right;
                dst[4] = r24 * left - r12 * centre + r6 * right;
                dst[5] = right;
            }

            /* Scatter the 36 results, one per FSTRIDE-spaced plane. */
#pragma unroll(9)
            for (plane = 0; plane < 36; ++plane) {
                out[plane * FSTRIDE + k * C + c] = GgGt[plane];
            }
        }
    }
}
static void out_transform_4x3_16t(long x, long y, long nrows,
const float* dataSrc, float* dataDst,
long *counter) {
long coter = *counter;
float c1[384] __attribute__((aligned(64)));
__m512 bufA[36], bufB, bufC, bufD, bufE, bufF, bufG, bufH, bufI;
__m512 bufTemp[24];
__m512i idx0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4,
19, 3, 18, 2, 17, 1, 16, 0);
__m512i idx1 = _mm512_set_epi32(31, 15, 30, 14, 29, 13, 28, 12,
27, 11, 26, 10, 25, 9, 24, 8);
/* 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
1 5 9 13 17 21 25 29 33 37 41 45 49 53 57 61
2 6 10 14 18 22 26 30 34 38 42 46 50 54 58 62
3 7 11 15 19 23 27 31 35 39 43 47 51 55 59 63 */
/* 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31
32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62
33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63 */
/* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 */
bufA[0] = _mm512_loadu_ps(dataSrc + 0 * OSTRIDE + coter);
bufA[1] = _mm512_loadu_ps(dataSrc + 1 * OSTRIDE + coter);
bufA[2] = _mm512_loadu_ps(dataSrc + 2 * OSTRIDE + coter);
bufA[3] = _mm512_loadu_ps(dataSrc + 3 * OSTRIDE + coter);
bufA[4] = _mm512_loadu_ps(dataSrc + 4 * OSTRIDE + coter);
bufA[5] = _mm512_loadu_ps(dataSrc + 5 * OSTRIDE + coter);
bufA[6] = _mm512_loadu_ps(dataSrc + 6 * OSTRIDE + coter);
bufA[7] = _mm512_loadu_ps(dataSrc + 7 * OSTRIDE + coter);
bufA[8] = _mm512_loadu_ps(dataSrc + 8 * OSTRIDE + coter);
bufA[9] = _mm512_loadu_ps(dataSrc + 9 * OSTRIDE + coter);
bufA[10] = _mm512_loadu_ps(dataSrc + 10 * OSTRIDE + coter);
bufA[11] = _mm512_loadu_ps(dataSrc + 11 * OSTRIDE + coter);
bufA[12] = _mm512_loadu_ps(dataSrc + 12 * OSTRIDE + coter);
bufA[13] = _mm512_loadu_ps(dataSrc + 13 * OSTRIDE + coter);
bufA[14] = _mm512_loadu_ps(dataSrc + 14 * OSTRIDE + coter);
bufA[15] = _mm512_loadu_ps(dataSrc + 15 * OSTRIDE + coter);
bufA[16] = _mm512_loadu_ps(dataSrc + 16 * OSTRIDE + coter);
bufA[17] = _mm512_loadu_ps(dataSrc + 17 * OSTRIDE + coter);
bufA[18] = _mm512_loadu_ps(dataSrc + 18 * OSTRIDE + coter);
bufA[19] = _mm512_loadu_ps(dataSrc + 19 * OSTRIDE + coter);
bufA[20] = _mm512_loadu_ps(dataSrc + 20 * OSTRIDE + coter);
bufA[21] = _mm512_loadu_ps(dataSrc + 21 * OSTRIDE + coter);
bufA[22] = _mm512_loadu_ps(dataSrc + 22 * OSTRIDE + coter);
bufA[23] = _mm512_loadu_ps(dataSrc + 23 * OSTRIDE + coter);
bufA[24] = _mm512_loadu_ps(dataSrc + 24 * OSTRIDE + coter);
bufA[25] = _mm512_loadu_ps(dataSrc + 25 * OSTRIDE + coter);
bufA[26] = _mm512_loadu_ps(dataSrc + 26 * OSTRIDE + coter);
bufA[27] = _mm512_loadu_ps(dataSrc + 27 * OSTRIDE + coter);
bufA[28] = _mm512_loadu_ps(dataSrc + 28 * OSTRIDE + coter);
bufA[29] = _mm512_loadu_ps(dataSrc + 29 * OSTRIDE + coter);
bufA[30] = _mm512_loadu_ps(dataSrc + 30 * OSTRIDE + coter);
bufA[31] = _mm512_loadu_ps(dataSrc + 31 * OSTRIDE + coter);
bufA[32] = _mm512_loadu_ps(dataSrc + 32 * OSTRIDE + coter);
bufA[33] = _mm512_loadu_ps(dataSrc + 33 * OSTRIDE + coter);
bufA[34] = _mm512_loadu_ps(dataSrc + 34 * OSTRIDE + coter);
bufA[35] = _mm512_loadu_ps(dataSrc + 35 * OSTRIDE + coter);
__m512 m2 = _mm512_set1_ps(2);
__m512 m4 = _mm512_set1_ps(4);
__m512 m8 = _mm512_set1_ps(8);
bufTemp[0] = _mm512_add_ps(bufA[0], bufA[6]);
bufTemp[1] = _mm512_add_ps(bufA[1], bufA[7]);
bufTemp[2] = _mm512_add_ps(bufA[2], bufA[8]);
bufTemp[3] = _mm512_add_ps(bufA[3], bufA[9]);
bufTemp[4] = _mm512_add_ps(bufA[4], bufA[10]);
bufTemp[5] = _mm512_add_ps(bufA[5], bufA[11]);
bufTemp[0] = _mm512_add_ps(bufTemp[0], bufA[12]);
bufTemp[1] = _mm512_add_ps(bufTemp[1], bufA[13]);
bufTemp[2] = _mm512_add_ps(bufTemp[2], bufA[14]);
bufTemp[3] = _mm512_add_ps(bufTemp[3], bufA[15]);
bufTemp[4] = _mm512_add_ps(bufTemp[4], bufA[16]);
bufTemp[5] = _mm512_add_ps(bufTemp[5], bufA[17]);
bufTemp[0] = _mm512_add_ps(bufTemp[0], bufA[18]);
bufTemp[1] = _mm512_add_ps(bufTemp[1], bufA[19]);
bufTemp[2] = _mm512_add_ps(bufTemp[2], bufA[20]);
bufTemp[3] = _mm512_add_ps(bufTemp[3], bufA[21]);
bufTemp[4] = _mm512_add_ps(bufTemp[4], bufA[22]);
bufTemp[5] = _mm512_add_ps(bufTemp[5], bufA[23]);
bufTemp[0] = _mm512_add_ps(bufTemp[0], bufA[24]);
bufTemp[1] = _mm512_add_ps(bufTemp[1], bufA[25]);
bufTemp[2] = _mm512_add_ps(bufTemp[2], bufA[26]);
bufTemp[3] = _mm512_add_ps(bufTemp[3], bufA[27]);
bufTemp[4] = _mm512_add_ps(bufTemp[4], bufA[28]);
bufTemp[5] = _mm512_add_ps(bufTemp[5], bufA[29]);
bufTemp[6] = _mm512_sub_ps(bufA[6], bufA[12]);
bufTemp[7] = _mm512_sub_ps(bufA[7], bufA[13]);
bufTemp[8] = _mm512_sub_ps(bufA[8], bufA[14]);
bufTemp[9] = _mm512_sub_ps(bufA[9], bufA[15]);
bufTemp[10] = _mm512_sub_ps(bufA[10], bufA[16]);
bufTemp[11] = _mm512_sub_ps(bufA[11], bufA[17]);
bufTemp[6] = _mm512_fmadd_ps(bufA[18], m2, bufTemp[6]);
bufTemp[7] = _mm512_fmadd_ps(bufA[19], m2, bufTemp[7]);
bufTemp[8] = _mm512_fmadd_ps(bufA[20], m2, bufTemp[8]);
bufTemp[9] = _mm512_fmadd_ps(bufA[21], m2, bufTemp[9]);
bufTemp[10] = _mm512_fmadd_ps(bufA[22], m2, bufTemp[10]);
bufTemp[11] = _mm512_fmadd_ps(bufA[23], m2, bufTemp[11]);
bufTemp[6] = _mm512_fnmadd_ps(bufA[24], m2, bufTemp[6]);
bufTemp[7] = _mm512_fnmadd_ps(bufA[25], m2, bufTemp[7]);
bufTemp[8] = _mm512_fnmadd_ps(bufA[26], m2, bufTemp[8]);
bufTemp[9] = _mm512_fnmadd_ps(bufA[27], m2, bufTemp[9]);
bufTemp[10] = _mm512_fnmadd_ps(bufA[28], m2, bufTemp[10]);
bufTemp[11] = _mm512_fnmadd_ps(bufA[29], m2, bufTemp[11]);
bufTemp[12] = _mm512_add_ps(bufA[6], bufA[12]);
bufTemp[13] = _mm512_add_ps(bufA[7], bufA[13]);
bufTemp[14] = _mm512_add_ps(bufA[8], bufA[14]);
bufTemp[15] = _mm512_add_ps(bufA[9], bufA[15]);
bufTemp[16] = _mm512_add_ps(bufA[10], bufA[16]);
bufTemp[17] = _mm512_add_ps(bufA[11], bufA[17]);
bufTemp[12] = _mm512_fmadd_ps(m4, bufA[18], bufTemp[12]);
bufTemp[13] = _mm512_fmadd_ps(m4, bufA[19], bufTemp[13]);
bufTemp[14] = _mm512_fmadd_ps(m4, bufA[20], bufTemp[14]);
bufTemp[15] = _mm512_fmadd_ps(m4, bufA[21], bufTemp[15]);
bufTemp[16] = _mm512_fmadd_ps(m4, bufA[22], bufTemp[16]);
bufTemp[17] = _mm512_fmadd_ps(m4, bufA[23], bufTemp[17]);
bufTemp[12] = _mm512_fmadd_ps(m4, bufA[24], bufTemp[12]);
bufTemp[13] = _mm512_fmadd_ps(m4, bufA[25], bufTemp[13]);
bufTemp[14] = _mm512_fmadd_ps(m4, bufA[26], bufTemp[14]);
bufTemp[15] = _mm512_fmadd_ps(m4, bufA[27], bufTemp[15]);
bufTemp[16] = _mm512_fmadd_ps(m4, bufA[28], bufTemp[16]);
bufTemp[17] = _mm512_fmadd_ps(m4, bufA[29], bufTemp[17]);
bufTemp[18] = _mm512_sub_ps(bufA[6], bufA[12]);
bufTemp[19] = _mm512_sub_ps(bufA[7], bufA[13]);
bufTemp[20] = _mm512_sub_ps(bufA[8], bufA[14]);
bufTemp[21] = _mm512_sub_ps(bufA[9], bufA[15]);
bufTemp[22] = _mm512_sub_ps(bufA[10], bufA[16]);
bufTemp[23] = _mm512_sub_ps(bufA[11], bufA[17]);
bufTemp[18] = _mm512_fmadd_ps(m8, bufA[18], bufTemp[18]);
bufTemp[19] = _mm512_fmadd_ps(m8, bufA[19], bufTemp[19]);
bufTemp[20] = _mm512_fmadd_ps(m8, bufA[20], bufTemp[20]);
bufTemp[21] = _mm512_fmadd_ps(m8, bufA[21], bufTemp[21]);
bufTemp[22] = _mm512_fmadd_ps(m8, bufA[22], bufTemp[22]);
bufTemp[23] = _mm512_fmadd_ps(m8, bufA[23], bufTemp[23]);
bufTemp[18] = _mm512_fnmadd_ps(m8, bufA[24], bufTemp[18]);
bufTemp[19] = _mm512_fnmadd_ps(m8, bufA[25], bufTemp[19]);
bufTemp[20] = _mm512_fnmadd_ps(m8, bufA[26], bufTemp[20]);
bufTemp[21] = _mm512_fnmadd_ps(m8, bufA[27], bufTemp[21]);
bufTemp[22] = _mm512_fnmadd_ps(m8, bufA[28], bufTemp[22]);
bufTemp[23] = _mm512_fnmadd_ps(m8, bufA[29], bufTemp[23]);
bufTemp[18] = _mm512_add_ps(bufA[30], bufTemp[18]);
bufTemp[19] = _mm512_add_ps(bufA[31], bufTemp[19]);
bufTemp[20] = _mm512_add_ps(bufA[32], bufTemp[20]);
bufTemp[21] = _mm512_add_ps(bufA[33], bufTemp[21]);
bufTemp[22] = _mm512_add_ps(bufA[34], bufTemp[22]);
bufTemp[23] = _mm512_add_ps(bufA[35], bufTemp[23]);
bufB = _mm512_add_ps(bufTemp[0], bufTemp[1]);
bufB = _mm512_add_ps(bufB, bufTemp[2]);
bufB = _mm512_add_ps(bufB, bufTemp[3]);
bufB = _mm512_add_ps(bufB, bufTemp[4]);
bufC = _mm512_sub_ps(bufTemp[1], bufTemp[2]);
bufC = _mm512_fmadd_ps(m2, bufTemp[3], bufC);
bufC = _mm512_fnmadd_ps(m2, bufTemp[4], bufC);
bufD = _mm512_add_ps(bufTemp[1], bufTemp[2]);
bufD = _mm512_fmadd_ps(m4, bufTemp[3], bufD);
bufD = _mm512_fmadd_ps(m4, bufTemp[4], bufD);
bufE = _mm512_sub_ps(bufTemp[1], bufTemp[2]);
bufE = _mm512_fmadd_ps(m8, bufTemp[3], bufE);
bufE = _mm512_fnmadd_ps(m8, bufTemp[4], bufE);
bufE = _mm512_add_ps(bufTemp[5], bufE);
bufF = _mm512_permutex2var_ps(bufB, idx0, bufD);
bufG = _mm512_permutex2var_ps(bufC, idx0, bufE);
bufH = _mm512_permutex2var_ps(bufB, idx1, bufD);
bufI = _mm512_permutex2var_ps(bufC, idx1, bufE);
bufB = _mm512_permutex2var_ps(bufF, idx0, bufG);
bufC = _mm512_permutex2var_ps(bufF, idx1, bufG);
bufD = _mm512_permutex2var_ps(bufH, idx0, bufI);
bufE = _mm512_permutex2var_ps(bufH, idx1, bufI);
_mm512_storeu_ps(dataDst + (x + 0) * nrows + y + 0, bufB);
_mm512_storeu_ps(dataDst + (x + 0) * nrows + y + 16, bufC);
_mm512_storeu_ps(dataDst + (x + 0) * nrows + y + 32, bufD);
_mm512_storeu_ps(dataDst + (x + 0) * nrows + y + 48, bufE);
bufB = _mm512_add_ps(bufTemp[6], bufTemp[7]);
bufB = _mm512_add_ps(bufB, bufTemp[8]);
bufB = _mm512_add_ps(bufB, bufTemp[9]);
bufB = _mm512_add_ps(bufB, bufTemp[10]);
bufC = _mm512_sub_ps(bufTemp[7], bufTemp[8]);
bufC = _mm512_fmadd_ps(m2, bufTemp[9], bufC);
bufC = _mm512_fnmadd_ps(m2, bufTemp[10], bufC);
bufD = _mm512_add_ps(bufTemp[7], bufTemp[8]);
bufD = _mm512_fmadd_ps(m4, bufTemp[9], bufD);
bufD = _mm512_fmadd_ps(m4, bufTemp[10], bufD);
bufE = _mm512_sub_ps(bufTemp[7], bufTemp[8]);
bufE = _mm512_fmadd_ps(m8, bufTemp[9], bufE);
bufE = _mm512_fnmadd_ps(m8, bufTemp[10], bufE);
bufE = _mm512_add_ps(bufTemp[11], bufE);
bufF = _mm512_permutex2var_ps(bufB, idx0, bufD);
bufG = _mm512_permutex2var_ps(bufC, idx0, bufE);
bufH = _mm512_permutex2var_ps(bufB, idx1, bufD);
bufI = _mm512_permutex2var_ps(bufC, idx1, bufE);
bufB = _mm512_permutex2var_ps(bufF, idx0, bufG);
bufC = _mm512_permutex2var_ps(bufF, idx1, bufG);
bufD = _mm512_permutex2var_ps(bufH, idx0, bufI);
bufE = _mm512_permutex2var_ps(bufH, idx1, bufI);
_mm512_storeu_ps(dataDst + (x + 1) * nrows + y + 0, bufB);
_mm512_storeu_ps(dataDst + (x + 1) * nrows + y + 16, bufC);
_mm512_storeu_ps(dataDst + (x + 1) * nrows + y + 32, bufD);
_mm512_storeu_ps(dataDst + (x + 1) * nrows + y + 48, bufE);
bufB = _mm512_add_ps(bufTemp[12], bufTemp[13]);
bufB = _mm512_add_ps(bufB, bufTemp[14]);
bufB = _mm512_add_ps(bufB, bufTemp[15]);
bufB = _mm512_add_ps(bufB, bufTemp[16]);
bufC = _mm512_sub_ps(bufTemp[13], bufTemp[14]);
bufC = _mm512_fmadd_ps(m2, bufTemp[15], bufC);
bufC = _mm512_fnmadd_ps(m2, bufTemp[16], bufC);
bufD = _mm512_add_ps(bufTemp[13], bufTemp[14]);
bufD = _mm512_fmadd_ps(m4, bufTemp[15], bufD);
bufD = _mm512_fmadd_ps(m4, bufTemp[16], bufD);
bufE = _mm512_sub_ps(bufTemp[13], bufTemp[14]);
bufE = _mm512_fmadd_ps(m8, bufTemp[15], bufE);
bufE = _mm512_fnmadd_ps(m8, bufTemp[16], bufE);