Skip to content

Commit

Permalink
Arm: Add NEON and MVE complex mul, mla and mls patterns.
Browse files Browse the repository at this point in the history
This adds implementations of the optabs for complex operations.  With this, the
following C code:

  void g (float complex a[restrict N], float complex b[restrict N],
	  float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] =  a[i] * b[i];
  }

generates

NEON:

g:
        vmov.f32        q11, #0.0  @ v4sf
        add     r3, r2, #1600
.L2:
        vmov    q8, q11  @ v4sf
        vld1.32 {q10}, [r1]!
        vld1.32 {q9}, [r0]!
        vcmla.f32       q8, q9, q10, #0
        vcmla.f32       q8, q9, q10, #90
        vst1.32 {q8}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

MVE:

g:
        push    {lr}
        mov     lr, #100
        dls     lr, lr
.L2:
        vldrw.32        q1, [r1], #16
        vldrw.32        q2, [r0], #16
        vcmul.f32       q3, q2, q1, #0
        vcmla.f32       q3, q2, q1, #90
        vstrw.32        q3, [r2], #16
        le      lr, .L2
        ldr     pc, [sp], #4

instead of

g:
        add     r3, r2, #1600
.L2:
        vld2.32 {d20-d23}, [r0]!
        vld2.32 {d16-d19}, [r1]!
        vmul.f32        q14, q11, q9
        vmul.f32        q15, q11, q8
        vneg.f32        q14, q14
        vfma.f32        q15, q10, q9
        vfma.f32        q14, q10, q8
        vmov    q13, q15  @ v4sf
        vmov    q12, q14  @ v4sf
        vst2.32 {d24-d27}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

and

g:
        add     r3, r2, #1600
.L2:
        vld2.32 {d20-d23}, [r0]!
        vld2.32 {d16-d19}, [r1]!
        vmul.f32        q15, q10, q8
        vmul.f32        q14, q10, q9
        vmls.f32        q15, q11, q9
        vmla.f32        q14, q11, q8
        vmov    q12, q15  @ v4sf
        vmov    q13, q14  @ v4sf
        vst2.32 {d24-d27}, [r2]!
        cmp     r3, r2
        bne     .L2
        bx      lr

respectively.

gcc/ChangeLog:

	* config/arm/iterators.md (rotsplit1, rotsplit2, conj_op, fcmac1,
	VCMLA_OP, VCMUL_OP): New.
	* config/arm/mve.md (mve_vcmlaq<mve_rot><mode>): Support vec_dup 0.
	* config/arm/neon.md (cmul<conj_op><mode>3): New.
	* config/arm/unspecs.md (UNSPEC_VCMLA_CONJ, UNSPEC_VCMLA180_CONJ,
	UNSPEC_VCMUL_CONJ): New.
	* config/arm/vec-common.md (cmul<conj_op><mode>3, arm_vcmla<rot><mode>,
	cml<fcmac1><conj_op><mode>4): New.
  • Loading branch information
TamarChristinaArm committed Jan 25, 2021
1 parent 02551aa commit 389b67f
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 6 deletions.
40 changes: 40 additions & 0 deletions gcc/config/arm/iterators.md
Expand Up @@ -1186,6 +1186,33 @@
(UNSPEC_VCMLA180 "180")
(UNSPEC_VCMLA270 "270")])

;; The complex operations, when performed on a real complex number, require two
;; instructions to perform the operation, e.g. complex multiplication requires
;; two VCMUL with a particular rotation value.
;;
;; These values can be looked up in rotsplit1 and rotsplit2.  As an example,
;; VCMUL needs the first instruction to use #0 and the second #90.
;;
;; rotsplit1: rotation immediate used by the first instruction of the
;; two-instruction sequence, keyed by the operation's unspec.
(define_int_attr rotsplit1
  [(UNSPEC_VCMUL "0")
   (UNSPEC_VCMUL_CONJ "0")
   (UNSPEC_VCMLA "0")
   (UNSPEC_VCMLA_CONJ "0")
   (UNSPEC_VCMLA180 "180")
   (UNSPEC_VCMLA180_CONJ "180")])

;; rotsplit2: rotation immediate used by the second instruction of a
;; two-instruction complex sequence, keyed by the operation's unspec.
(define_int_attr rotsplit2
  [(UNSPEC_VCMUL "90")
   (UNSPEC_VCMUL_CONJ "270")
   (UNSPEC_VCMLA "90")
   (UNSPEC_VCMLA_CONJ "270")
   (UNSPEC_VCMLA180 "270")
   (UNSPEC_VCMLA180_CONJ "90")])

;; conj_op: pattern-name suffix for each complex unspec.  The *_CONJ unspecs
;; map to "_conj"; the others map to the empty string.
(define_int_attr conj_op
  [(UNSPEC_VCMUL "")
   (UNSPEC_VCMUL_CONJ "_conj")
   (UNSPEC_VCMLA "")
   (UNSPEC_VCMLA_CONJ "_conj")
   (UNSPEC_VCMLA180 "")
   (UNSPEC_VCMLA180_CONJ "_conj")])

(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
(UNSPEC_VCADD270 "_rot270")
(UNSPEC_VCMLA "")
Expand All @@ -1200,6 +1227,9 @@
;; VCMUL: the four VCMUL unspecs (plain, 90, 180 and 270).
(define_int_iterator VCMUL
  [UNSPEC_VCMUL
   UNSPEC_VCMUL90
   UNSPEC_VCMUL180
   UNSPEC_VCMUL270])

;; fcmac1: single letter spliced into the "cml<fcmac1>..." pattern name.
;; The VCMLA unspecs give "a" and the VCMLA180 unspecs give "s".
(define_int_attr fcmac1
  [(UNSPEC_VCMLA "a")
   (UNSPEC_VCMLA_CONJ "a")
   (UNSPEC_VCMLA180 "s")
   (UNSPEC_VCMLA180_CONJ "s")])

(define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") (UNSPEC_QSUB8 "qsub8")
(UNSPEC_SHADD8 "shadd8") (UNSPEC_SHSUB8 "shsub8")
(UNSPEC_UHADD8 "uhadd8") (UNSPEC_UHSUB8 "uhsub8")
Expand Down Expand Up @@ -1723,3 +1753,13 @@
;; Two-element int iterators; a pattern using one of them is instantiated once
;; for each code listed in its brackets.
(define_int_iterator UQRSHLLQ [UQRSHLL_64 UQRSHLL_48])
(define_int_iterator SQRSHRLQ [SQRSHRL_64 SQRSHRL_48])
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])

;; VCMLA_OP: the accumulate-style complex operations, i.e. the VCMLA and
;; VCMLA180 unspecs together with their conjugate (_CONJ) variants.
(define_int_iterator VCMLA_OP
  [UNSPEC_VCMLA
   UNSPEC_VCMLA_CONJ
   UNSPEC_VCMLA180
   UNSPEC_VCMLA180_CONJ])

;; VCMUL_OP: the multiply-style complex operations, i.e. the VCMUL unspec and
;; its conjugate (_CONJ) variant.
(define_int_iterator VCMUL_OP
  [UNSPEC_VCMUL
   UNSPEC_VCMUL_CONJ])
13 changes: 7 additions & 6 deletions gcc/config/arm/mve.md
Expand Up @@ -4101,15 +4101,16 @@
;; MVE complex multiply-accumulate insn, enabled under
;; TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT.
;; NOTE(review): this hunk is a rendered diff without +/- markers, so it
;; appears to interleave the previous form (operand 1 inside the unspec,
;; constraints "0,Dz", templates vcmla then vcmul) with the new form
;; ((plus operand-1 (unspec ...)), operand 1 a reg_or_zero_operand with
;; constraints "Dz,0", templates vcmul then vcmla).  Confirm against the real
;; mve.md before relying on the text below.
(define_insn "mve_vcmlaq<mve_rot><mode>"
[
(set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
(match_operand:MVE_0 2 "s_register_operand" "w,w")
(match_operand:MVE_0 3 "s_register_operand" "w,w")]
VCMLA))
(plus:MVE_0 (match_operand:MVE_0 1 "reg_or_zero_operand" "Dz,0")
(unspec:MVE_0
[(match_operand:MVE_0 2 "s_register_operand" "w,w")
(match_operand:MVE_0 3 "s_register_operand" "w,w")]
VCMLA)))
]
"TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
"@
vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
[(set_attr "type" "mve_move")
])

Expand Down
19 changes: 19 additions & 0 deletions gcc/config/arm/neon.md
Expand Up @@ -2952,6 +2952,25 @@
[(set_attr "type" "neon_fcmla")]
)

;; A complex multiply is always emitted as a pair of instructions: the first
;; computes half of the result and the second completes it.  Since two
;; instructions are needed in every case, the pattern is expanded early.
(define_expand "cmul<conj_op><mode>3"
  [(set (match_operand:VDF 0 "register_operand")
	(unspec:VDF [(match_operand:VDF 1 "register_operand")
		     (match_operand:VDF 2 "register_operand")]
		    VCMUL_OP))]
  "TARGET_COMPLEX && !BYTES_BIG_ENDIAN"
{
  rtx dest = operands[0];
  rtx mul_a = operands[1];
  rtx mul_b = operands[2];

  /* Fresh register for the half-finished product, then a register holding
     zero to serve as the initial accumulator.  */
  rtx partial = gen_reg_rtx (<MODE>mode);
  rtx zero = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));

  /* First rotation accumulates into zero; the second rotation accumulates
     into the partial result.  The multiplicands are passed as operand 2
     followed by operand 1.  */
  emit_insn (gen_neon_vcmla<rotsplit1><mode> (partial, zero, mul_b, mul_a));
  emit_insn (gen_neon_vcmla<rotsplit2><mode> (dest, partial, mul_b, mul_a));
  DONE;
})


;; These instructions map to the __builtins for the Dot Product operations.
(define_insn "neon_<sup>dot<vsi2qi>"
Expand Down
3 changes: 3 additions & 0 deletions gcc/config/arm/unspecs.md
Expand Up @@ -510,10 +510,13 @@
UNSPEC_VCMLA90
UNSPEC_VCMLA180
UNSPEC_VCMLA270
UNSPEC_VCMLA_CONJ
UNSPEC_VCMLA180_CONJ
UNSPEC_VCMUL
UNSPEC_VCMUL90
UNSPEC_VCMUL180
UNSPEC_VCMUL270
UNSPEC_VCMUL_CONJ
UNSPEC_MATMUL_S
UNSPEC_MATMUL_U
UNSPEC_MATMUL_US
Expand Down
57 changes: 57 additions & 0 deletions gcc/config/arm/vec-common.md
Expand Up @@ -215,6 +215,63 @@
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
)

;; A complex multiply is always emitted as a pair of instructions: the first
;; computes half of the result and the second completes it.  Since two
;; instructions are needed in every case, the pattern is expanded early.
(define_expand "cmul<conj_op><mode>3"
  [(set (match_operand:VQ_HSF 0 "register_operand")
	(unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
			(match_operand:VQ_HSF 2 "register_operand")]
		       VCMUL_OP))]
  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT))
   && !BYTES_BIG_ENDIAN"
{
  rtx dest = operands[0];
  rtx mul_a = operands[1];
  rtx mul_b = operands[2];

  rtx partial = gen_reg_rtx (<MODE>mode);
  rtx zero = CONST0_RTX (<MODE>mode);

  /* Under TARGET_COMPLEX the zero accumulator is first forced into a
     register; otherwise the constant zero is handed to the pattern as is.  */
  if (TARGET_COMPLEX)
    zero = force_reg (<MODE>mode, zero);

  /* First rotation accumulates into zero; the second rotation accumulates
     into the partial result.  The multiplicands are passed as operand 2
     followed by operand 1.  */
  emit_insn (gen_arm_vcmla<rotsplit1><mode> (partial, zero, mul_b, mul_a));
  emit_insn (gen_arm_vcmla<rotsplit2><mode> (dest, partial, mul_b, mul_a));
  DONE;
})

;; Named expander for one complex multiply-accumulate step: operand 0 is set
;; to operand 1 plus the VCMLA unspec of operands 2 and 3.  It has no
;; preparation statements, so the RTL template is emitted as written.  The
;; cmul and cml expanders in this file call it through
;; gen_arm_vcmla<rotsplit1><mode> and gen_arm_vcmla<rotsplit2><mode>.
(define_expand "arm_vcmla<rot><mode>"
[(set (match_operand:VF 0 "register_operand")
(plus:VF (match_operand:VF 1 "register_operand")
(unspec:VF [(match_operand:VF 2 "register_operand")
(match_operand:VF 3 "register_operand")]
VCMLA)))]
"(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
&& ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
)

;; A complex multiply-accumulate / multiply-subtract is always emitted as a
;; pair of instructions: the first computes half of the result and the second
;; completes it.  Since two instructions are needed in every case, the pattern
;; is expanded early.
(define_expand "cml<fcmac1><conj_op><mode>4"
  [(set (match_operand:VF 0 "register_operand")
	(plus:VF (match_operand:VF 1 "register_operand")
		 (unspec:VF [(match_operand:VF 2 "register_operand")
			     (match_operand:VF 3 "register_operand")]
			    VCMLA_OP)))]
  "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
		      && ARM_HAVE_<MODE>_ARITH)) && !BYTES_BIG_ENDIAN"
{
  rtx dest = operands[0];
  rtx acc = operands[1];
  rtx mul_a = operands[2];
  rtx mul_b = operands[3];

  /* Holds the accumulator after the first of the two rotations.  */
  rtx partial = gen_reg_rtx (<MODE>mode);

  /* First rotation accumulates into operand 1; the second accumulates into
     the partial result.  The multiplicands are passed as operand 3 followed
     by operand 2.  */
  emit_insn (gen_arm_vcmla<rotsplit1><mode> (partial, acc, mul_b, mul_a));
  emit_insn (gen_arm_vcmla<rotsplit2><mode> (dest, partial, mul_b, mul_a));
  DONE;
})

(define_expand "movmisalign<mode>"
[(set (match_operand:VDQX 0 "neon_perm_struct_or_reg_operand")
(unspec:VDQX [(match_operand:VDQX 1 "neon_perm_struct_or_reg_operand")]
Expand Down

0 comments on commit 389b67f

Please sign in to comment.