From 8dbecded455fbeecb669cd4b69b1139452396e68 Mon Sep 17 00:00:00 2001 From: Krishnam Tibrewala Date: Tue, 25 Mar 2025 17:49:31 -0500 Subject: [PATCH] [AIE2P] Use OR to mimic MOV when copying GPR to GPR --- llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp | 21 +++++--- llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h | 1 + llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.td | 11 ++++ .../aie2p/AIE2PMultiSlotPseudoInstrInfo.td | 7 +++ llvm/test/CodeGen/AIE/aie2p/cascade-stream.ll | 8 +-- .../AIE/aie2p/insert-element-64bits.ll | 8 +-- .../AIE/aie2p/postrapseudos/pseudomove.mir | 52 ++++++++++++++++++- .../CodeGen/AIE/aie2p/run-physreg-copy.mir | 4 +- llvm/test/CodeGen/AIE/aie2p/streams.ll | 5 +- llvm/test/CodeGen/AIE/aie2p/vscl2vec.ll | 30 +++++------ 10 files changed, 112 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp index dc901fd9462a..f6b3b76808a3 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.cpp @@ -557,11 +557,8 @@ void AIE2PInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (AIE2P::mMvSclSrcRegClass.contains(SrcReg) && AIE2P::mMvSclDstRegClass.contains(DstReg)) { // Build MultiSlotPseudo in preference - unsigned Opcode = (AIE2P::mAguSrcRegClass.contains(SrcReg) && - AIE2P::mAguDstRegClass.contains(DstReg)) - ? AIE2P::MOV_scalar_pseudo - : AIE2P::MOV_alu_mv_mv_mv_scl; - BuildMI(MBB, MBBI, DL, get(Opcode), DstReg) + const unsigned MOVSclOpcode = getScalarMovOpcode(DstReg, SrcReg); + BuildMI(MBB, MBBI, DL, get(MOVSclOpcode), DstReg) .addReg(SrcReg, getKillRegState(KillSrc)); } else if ((AIE2P::eLRegClass.contains(SrcReg)) && (AIE2P::eLRegClass.contains(DstReg))) { @@ -1179,6 +1176,17 @@ unsigned AIE2PInstrInfo::getConstantMovOpcode(MachineRegisterInfo &MRI, llvm_unreachable("Expected imm. size <= 32 bits"); } +unsigned AIE2PInstrInfo::getScalarMovOpcode(Register DstReg, + Register SrcReg) const { + return (AIE2P::eRRegClass.contains(SrcReg) && + AIE2P::eRRegClass.contains(DstReg)) + ? AIE2P::MOV_OR_pseudo + : (AIE2P::mAguSrcRegClass.contains(SrcReg) && + AIE2P::mAguDstRegClass.contains(DstReg)) + ? AIE2P::MOV_scalar_pseudo + : AIE2P::MOV_alu_mv_mv_mv_scl; +} + unsigned AIE2PInstrInfo::getCycleSeparatorOpcode() const { return AIE2P::CYCLE_SEPARATOR; } @@ -1192,7 +1200,8 @@ bool AIE2PInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AIE2P::PseudoMove: { Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); - BuildMI(MBB, MI, DL, get(AIE2P::MOV_alu_mv_mv_mv_scl), Dst) + const unsigned MOVSclOpcode = getScalarMovOpcode(Dst, Src); + BuildMI(MBB, MI, DL, get(MOVSclOpcode), Dst) .addReg(Src, getKillRegState(MI.getOperand(1).isKill())); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h index 188c16b65045..8b89194796f8 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.h @@ -45,6 +45,7 @@ class AIE2PInstrInfo : public AIE2PGenInstrInfo { unsigned getPseudoMoveOpcode() const override; unsigned getConstantMovOpcode(MachineRegisterInfo &MRI, unsigned int Reg, APInt &Val) const override; + unsigned getScalarMovOpcode(Register DstReg, Register SrcReg) const override; unsigned getMvScl2MS(unsigned ConstTLastVal) const override; unsigned getMvNBScl2MS(unsigned ConstTLastVal) const override; unsigned getMvScl2MSTlastRegOpcode() const override; diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.td b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.td index 00467f412c5e..ffe2ce505bb1 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.td +++ b/llvm/lib/Target/AIE/aie2p/AIE2PInstrInfo.td @@ -130,6 +130,17 @@ let hasDelaySlot = true, isBranch = true, isTerminator = true, } } +// Modified OR instruction to mimic MOV operation +// OR $dst, $src, $src is equivalent to MOV $dst, $src +let Itinerary = II_OR, isCodeGenOnly = 1, hasSideEffects = false, + mayLoad = 0, mayStore = 0 in { + def MOV_OR : AIE2P_inst_alu_instr32 <(outs eR:$d0), (ins eR:$s0), "or", " $d0, $s0, $s0">{ + bits<5> s0; + bits<5> d0; + let alu = {s0, d0, s0, 0b0101, 0b1}; + } +} + include "aie2p/AIE2PMultiSlotPseudoInstrInfo.td" // Define _split variants for instructions using 2D registers class Split2DInstr : SplitPseudo; } +// We use OR to mimic MOV behavior, and not ADD with 0 because ADD uses $srCarry register +let Itinerary = II_MOV_alu_mv_mv_mv_scl, isMoveReg = 1, hasSideEffects = false, mayLoad = false, mayStore = false in { + def MOV_OR_pseudo : MultiSlot_Pseudo<(outs eR:$mRx), (ins eR:$mRx0), + "mov_scl_pseudo", "$mRx, $mRx0", + [MOV_alu_mv_mv_mv_scl, MOV_OR]>; +} + // Pseudo VLD let hasSideEffects = false, mayLoad = true, mayStore = false in { // Fifo fill. diff --git a/llvm/test/CodeGen/AIE/aie2p/cascade-stream.ll b/llvm/test/CodeGen/AIE/aie2p/cascade-stream.ll index 178878956964..ce8861059c59 100644 --- a/llvm/test/CodeGen/AIE/aie2p/cascade-stream.ll +++ b/llvm/test/CodeGen/AIE/aie2p/cascade-stream.ll @@ -150,8 +150,8 @@ define dso_local inreg noundef <32 x i32> @_Z28test_get_scd_expand_v32acc32ii(i3 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: vmov dm0, scd, r31; nopb ; nopxm ; CHECK-NEXT: ret lr -; CHECK-NEXT: mov r31, r1 // Delay Slot 5 -; CHECK-NEXT: mov crscden, r0 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: or r31, r1, r1; mov crscden, r0 // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov crscden, #1 // Delay Slot 1 @@ -167,8 +167,8 @@ define dso_local inreg noundef <64 x i32> @_Z28test_get_scd_expand_v64acc32ii(i3 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: vmov dm0, scd, r31; nopb ; nopxm ; CHECK-NEXT: ret lr -; CHECK-NEXT: mov r31, r1 // Delay Slot 5 -; CHECK-NEXT: mov crscden, r0 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: or r31, r1, r1; mov crscden, r0 // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov crscden, #1 // Delay Slot 1 diff --git a/llvm/test/CodeGen/AIE/aie2p/insert-element-64bits.ll b/llvm/test/CodeGen/AIE/aie2p/insert-element-64bits.ll index 39d4299d5368..549d2e788c9b 100644 --- a/llvm/test/CodeGen/AIE/aie2p/insert-element-64bits.ll +++ b/llvm/test/CodeGen/AIE/aie2p/insert-element-64bits.ll @@ -14,11 +14,11 @@ define dso_local noundef <64 x i8> @insert_element_64(<8 x i64> noundef %v, i32 ; CHECK-NEXT: vlda bmll0, [sp, #-64]; nopb ; nops ; nopxm ; nopv ; CHECK-NEXT: nopx ; CHECK-NEXT: nop -; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: mov r29, r0 // Delay Slot 5 -; CHECK-NEXT: mov r5, r2 // Delay Slot 4 -; CHECK-NEXT: vmov x0, bmll0 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: or r4, r1, r1; mov r29, r0 // Delay Slot 4 +; CHECK-NEXT: or r5, r2, r2; vmov x0, bmll0 // Delay Slot 3 ; CHECK-NEXT: vinsert.64 x0, x0, r29, r5:r4 // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: diff --git a/llvm/test/CodeGen/AIE/aie2p/postrapseudos/pseudomove.mir b/llvm/test/CodeGen/AIE/aie2p/postrapseudos/pseudomove.mir index a2658eddec4f..33f918daf3ae 100644 --- a/llvm/test/CodeGen/AIE/aie2p/postrapseudos/pseudomove.mir +++ b/llvm/test/CodeGen/AIE/aie2p/postrapseudos/pseudomove.mir @@ -13,6 +13,56 @@ alignment: 16 body: | bb.0 (align 16): ; CHECK-LABEL: name: test_pseudomove - ; CHECK: $r0 = MOV_alu_mv_mv_mv_scl killed $r8 + ; CHECK: $r0 = MOV_OR_pseudo killed $r8 $r0 = PseudoMove killed $r8 ... + +--- +name: pseudoMov_non_GPR +alignment: 16 +body: | + bb.0 (align 16): + ; CHECK-LABEL: name: pseudoMov_non_GPR + ; CHECK: $r1 = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: $p0 = MOV_scalar_pseudo $r1 + ; CHECK-NEXT: $p1 = MOV_scalar_pseudo $p0 + ; CHECK-NEXT: $r1 = MOV_alu_mv_mv_mv_scl $s0 + ; CHECK-NEXT: $s0 = MOV_alu_mv_mv_mv_scl $r1 + ; CHECK-NEXT: $s1 = MOV_alu_mv_mv_mv_scl $s0 + $r1 = PseudoMove $p0 + $p0 = PseudoMove $r1 + $p1 = PseudoMove $p0 + $r1 = PseudoMove $s0 + $s0 = PseudoMove $r1 + $s1 = PseudoMove $s0 +... + +--- +name: COPY_GPR +alignment: 16 +body: | + bb.0 (align 16): + ; CHECK-LABEL: name: COPY_GPR + ; CHECK: $r1 = MOV_OR_pseudo $r0 + $r1 = COPY $r0 +... + +--- +name: COPY_non_GPR +alignment: 16 +body: | + bb.0 (align 16): + ; CHECK-LABEL: name: COPY_non_GPR + ; CHECK: $r1 = MOV_alu_mv_mv_mv_scl $p0 + ; CHECK-NEXT: $p0 = MOV_scalar_pseudo $r1 + ; CHECK-NEXT: $p1 = MOV_scalar_pseudo $p0 + ; CHECK-NEXT: $r1 = MOV_alu_mv_mv_mv_scl $s0 + ; CHECK-NEXT: $s0 = MOV_alu_mv_mv_mv_scl $r1 + ; CHECK-NEXT: $s1 = MOV_alu_mv_mv_mv_scl $s0 + $r1 = COPY $p0 + $p0 = COPY $r1 + $p1 = COPY $p0 + $r1 = COPY $s0 + $s0 = COPY $r1 + $s1 = COPY $s0 +... diff --git a/llvm/test/CodeGen/AIE/aie2p/run-physreg-copy.mir b/llvm/test/CodeGen/AIE/aie2p/run-physreg-copy.mir index f4380ad3315b..aa1d8c523718 100644 --- a/llvm/test/CodeGen/AIE/aie2p/run-physreg-copy.mir +++ b/llvm/test/CodeGen/AIE/aie2p/run-physreg-copy.mir @@ -165,11 +165,11 @@ body: | ; CHECK: $p1 = MOV_scalar_pseudo $p0 ; CHECK-NEXT: $lfl1 = VMOV_alu_mv_mv_x $lfl0 ; CHECK-NEXT: $lfh1 = VMOV_alu_mv_mv_x $lfh0 - ; CHECK-NEXT: $r25 = MOV_alu_mv_mv_mv_scl $r24 + ; CHECK-NEXT: $r25 = MOV_OR_pseudo $r24 ; CHECK-NEXT: $p0 = MOV_scalar_pseudo $p1 ; CHECK-NEXT: $lfl0 = VMOV_alu_mv_mv_x $lfl1 ; CHECK-NEXT: $lfh0 = VMOV_alu_mv_mv_x $lfh1 - ; CHECK-NEXT: $r24 = MOV_alu_mv_mv_mv_scl $r25 + ; CHECK-NEXT: $r24 = MOV_OR_pseudo $r25 $plfr1 = COPY $plfr0 $plfr0 = COPY $plfr1 ... diff --git a/llvm/test/CodeGen/AIE/aie2p/streams.ll b/llvm/test/CodeGen/AIE/aie2p/streams.ll index 100f16ce3693..c92e55a3fd27 100644 --- a/llvm/test/CodeGen/AIE/aie2p/streams.ll +++ b/llvm/test/CodeGen/AIE/aie2p/streams.ll @@ -138,8 +138,7 @@ define dso_local void @_Z19test_put_ms_v64bf16Dv64_u6__bf16ii(<64 x bfloat> noun ; CHECK-LABEL: _Z19test_put_ms_v64bf16Dv64_u6__bf16ii: ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: nopa ; nopx ; mov r28, r1 -; CHECK-NEXT: vextract.32 r0, x4, #0, vaddsign1 +; CHECK-NEXT: nopa ; nopb ; nopx ; vextract.32 r0, x4, #0, vaddsign1 ; CHECK-NEXT: vextract.32 r2, x4, #1, vaddsign1 ; CHECK-NEXT: mov ms, r0; vextract.32 r0, x4, #2, vaddsign1 ; CHECK-NEXT: mov ms, r2; vextract.32 r2, x4, #3, vaddsign1 @@ -171,7 +170,7 @@ define dso_local void @_Z19test_put_ms_v64bf16Dv64_u6__bf16ii(<64 x bfloat> noun ; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #13, vaddsign1 ; CHECK-NEXT: mov ms, r0; ret lr; vextract.32 r0, x5, #14, vaddsign1 ; CHECK-NEXT: mov ms, r2; vextract.32 r2, x5, #15, vaddsign1 // Delay Slot 5 -; CHECK-NEXT: mov ms, r0 // Delay Slot 4 +; CHECK-NEXT: mov ms, r0; or r28, r1, r1 // Delay Slot 4 ; CHECK-NEXT: mov ms, r2, r28 // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 diff --git a/llvm/test/CodeGen/AIE/aie2p/vscl2vec.ll b/llvm/test/CodeGen/AIE/aie2p/vscl2vec.ll index bd6c7ce1ced3..2d48b739cc6e 100644 --- a/llvm/test/CodeGen/AIE/aie2p/vscl2vec.ll +++ b/llvm/test/CodeGen/AIE/aie2p/vscl2vec.ll @@ -629,9 +629,9 @@ define dso_local noundef <32 x bfloat> @_Z13test_upd_elemDv32_u6__bf16iy(<32 x b ; CHECK: .p2align 4 ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: nopa ; nopb ; ret lr; nopm ; nops -; CHECK-NEXT: mov r4, r1 // Delay Slot 5 -; CHECK-NEXT: mov r29, r0 // Delay Slot 4 -; CHECK-NEXT: mov r5, r2 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: mov r4, r1 // Delay Slot 4 +; CHECK-NEXT: or r29, r0, r0; mov r5, r2 // Delay Slot 3 ; CHECK-NEXT: vinsert.64 x0, x2, r29, r5:r4 // Delay Slot 2 ; CHECK-NEXT: nop // Delay Slot 1 entry: @@ -729,8 +729,8 @@ define dso_local noundef <2 x float> @_Z16test_ext_v2floatDv16_fii(<16 x float> ; CHECK-NEXT: nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r10, r0 // Delay Slot 2 -; CHECK-NEXT: mov r1, r8 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r10, r0, r0; mov r1, r8 // Delay Slot 1 ; CHECK-NEXT: lda lr, [sp, #-52]; nopx // 4-byte Folded Reload ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -740,8 +740,8 @@ define dso_local noundef <2 x float> @_Z16test_ext_v2floatDv16_fii(<16 x float> ; CHECK-NEXT: lda r11, [sp, #-64] // 4-byte Folded Reload ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov r11, r0 // Delay Slot 4 -; CHECK-NEXT: mov r0, r10 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: or r11, r0, r0; mov r0, r10 // Delay Slot 3 ; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 ; CHECK-NEXT: mov r1, r11 // Delay Slot 1 entry: @@ -803,8 +803,8 @@ define dso_local noundef <2 x float> @_Z20test_extract_v2floatDv16_fii(<16 x flo ; CHECK-NEXT: nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r10, r0 // Delay Slot 2 -; CHECK-NEXT: mov r1, r8 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r10, r0, r0; mov r1, r8 // Delay Slot 1 ; CHECK-NEXT: lda lr, [sp, #-52]; nopx // 4-byte Folded Reload ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -814,8 +814,8 @@ define dso_local noundef <2 x float> @_Z20test_extract_v2floatDv16_fii(<16 x flo ; CHECK-NEXT: lda r11, [sp, #-64] // 4-byte Folded Reload ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov r11, r0 // Delay Slot 4 -; CHECK-NEXT: mov r0, r10 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: or r11, r0, r0; mov r0, r10 // Delay Slot 3 ; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 ; CHECK-NEXT: mov r1, r11 // Delay Slot 1 entry: @@ -846,8 +846,8 @@ define dso_local noundef <2 x float> @_Z20test_extract_v2floatDv16_fi(<16 x floa ; CHECK-NEXT: nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: mov r10, r0 // Delay Slot 2 -; CHECK-NEXT: mov r1, r8 // Delay Slot 1 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: or r10, r0, r0; mov r1, r8 // Delay Slot 1 ; CHECK-NEXT: lda lr, [sp, #-52]; nopx // 4-byte Folded Reload ; CHECK-NEXT: nop ; CHECK-NEXT: nop @@ -857,8 +857,8 @@ define dso_local noundef <2 x float> @_Z20test_extract_v2floatDv16_fi(<16 x floa ; CHECK-NEXT: lda r11, [sp, #-64] // 4-byte Folded Reload ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mov r11, r0 // Delay Slot 4 -; CHECK-NEXT: mov r0, r10 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: or r11, r0, r0; mov r0, r10 // Delay Slot 3 ; CHECK-NEXT: paddxm [sp], #-64 // Delay Slot 2 ; CHECK-NEXT: mov r1, r11 // Delay Slot 1 entry: