From 7ee408d4c61aab22b021705df6812d625bebfe57 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Wed, 29 Oct 2025 05:13:05 -0400 Subject: [PATCH] [Testing Only] Only fold flat offsets if they are inbounds PTRADDs Squash of the following upstream PRs for downstream testing: - https://github.com/llvm/llvm-project/pull/165424 - https://github.com/llvm/llvm-project/pull/165425 - https://github.com/llvm/llvm-project/pull/165426 - https://github.com/llvm/llvm-project/pull/165427 Regenerated outputs for the following tests to resolve merge conflicts: - llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll - llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll For SWDEV-516125. --- llvm/include/llvm/CodeGen/SelectionDAG.h | 12 +- llvm/include/llvm/CodeGen/TargetLowering.h | 30 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 26 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 25 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 141 +- .../branch-folding-implicit-def-subreg.ll | 28 +- .../AMDGPU/cgp-addressing-modes-flat.ll | 84 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 100 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 88 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 88 +- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 88 +- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 286 +- .../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 200 +- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 246 +- .../AMDGPU/flat_atomics_i64_noprivate.ll | 246 +- .../flat_atomics_i64_system_noprivate.ll | 200 +- llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll | 915 +- .../AMDGPU/infer-addrspace-flat-atomic.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll | 28 +- .../CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll | 28 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 49 +- .../CodeGen/AMDGPU/memintrinsic-unroll.ll | 10065 ++++++++-------- .../AMDGPU/memory-legalizer-flat-agent.ll | 120 +- .../AMDGPU/memory-legalizer-flat-cluster.ll | 120 +- .../memory-legalizer-flat-singlethread.ll | 120 +- 
.../AMDGPU/memory-legalizer-flat-system.ll | 120 +- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 118 +- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 112 +- llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll | 10 +- .../AMDGPU/no-folding-imm-to-inst-with-fi.ll | 50 +- llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 50 +- .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 188 +- .../AMDGPU/promote-constOffset-to-imm.ll | 2 +- 33 files changed, 7483 insertions(+), 6516 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index df6ce0fe1b037..1a5ffb38f2568 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1113,7 +1113,8 @@ class SelectionDAG { SDValue Mask, SDValue EVL); /// Returns sum of the base pointer and offset. - /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. + /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap and InBounds by + /// default. LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); @@ -1123,15 +1124,18 @@ class SelectionDAG { /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple - /// components, create an add nuw from the base pointer to the offset. + /// components, create an add nuw (or ptradd nuw inbounds) from the base + /// pointer to the offset. SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. 
- return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } /// Return a new CALLSEQ_START node, that starts new call frame, in which diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 1920b98c8a1ef..78f63b4406eb0 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5649,17 +5649,35 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase { /// Get a pointer to vector element \p Idx located in memory for a vector of /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of /// bounds the returned pointer is unspecified, but will be within the vector - /// bounds. - SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - SDValue Index) const; + /// bounds. \p PtrArithFlags can be used to mark that arithmetic within the + /// vector in memory is known to not wrap or to be inbounds. + SDValue getVectorElementPointer( + SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; + + /// Get a pointer to vector element \p Idx located in memory for a vector of + /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of + /// bounds the returned pointer is unspecified, but will be within the vector + /// bounds. \p VecPtr is guaranteed to point to the beginning of a memory + /// location large enough for the vector. + SDValue getInboundsVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index) const { + return getVectorElementPointer(DAG, VecPtr, VecVT, Index, + SDNodeFlags::NoUnsignedWrap | + SDNodeFlags::InBounds); + } /// Get a pointer to a sub-vector of type \p SubVecVT at index \p Idx located /// in memory for a vector of type \p VecVT starting at a base address of /// \p VecPtr. 
If \p Idx plus the size of \p SubVecVT is out of bounds the /// returned pointer is unspecified, but the value returned will be such that - /// the entire subvector would be within the vector bounds. - SDValue getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, - EVT SubVecVT, SDValue Index) const; + /// the entire subvector would be within the vector bounds. \p PtrArithFlags + /// can be used to mark that arithmetic within the vector in memory is known + /// to not wrap or to be inbounds. + SDValue + getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, + EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags = SDNodeFlags()) const; /// Method for building the DAG expansion of ISD::[US][MIN|MAX]. This /// method accepts integers as its arguments. diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e5691c1f2c184..9c965bb97a50c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2715,6 +2715,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags); AddToWorklist(Add.getNode()); + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds x, y))), z) + // The outer inbounds ptradd might therefore rely on a provenance that x + // does not have. return DAG.getMemBasePlusOffset(X, Add, DL, Flags); } } @@ -2740,6 +2746,12 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { // that. 
SDNodeFlags Flags = (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + // We can't set InBounds even if both original ptradds were InBounds and + // NUW: SDAG usually represents pointers as integers, therefore, the + // matched pattern behaves as if it had implicit casts: + // (ptradd inbounds (inttoptr (ptrtoint (ptradd inbounds GA, v))), c) + // The outer inbounds ptradd might therefore rely on a provenance that + // GA does not have. SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); AddToWorklist(Inner.getNode()); return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); @@ -2763,8 +2775,13 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) { bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); // If both additions in the original were NUW, reassociation preserves that. - SDNodeFlags ReassocFlags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDNodeFlags CommonFlags = N->getFlags() & N1->getFlags(); + SDNodeFlags ReassocFlags = CommonFlags & SDNodeFlags::NoUnsignedWrap; + if (CommonFlags.hasNoUnsignedWrap()) { + // If both operations are NUW and the PTRADD is inbounds, the offsets are + // both non-negative, so the reassociated PTRADDs are also inbounds. + ReassocFlags |= N->getFlags() & SDNodeFlags::InBounds; + } if (ZIsConstant != YIsConstant) { if (YIsConstant) @@ -22745,7 +22762,10 @@ SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) { NewPtr = DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(COffset), DL); PointerInfo = ST->getPointerInfo().getWithOffset(COffset); } else { - NewPtr = TLI.getVectorElementPointer(DAG, Ptr, Value.getValueType(), Idx); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds.
+ NewPtr = TLI.getInboundsVectorElementPointer(DAG, Ptr, Value.getValueType(), + Idx); } return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(), diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index da4e40953b39a..9bdf82210fed1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10668,19 +10668,20 @@ static SDValue clampDynamicVectorIndex(SelectionDAG &DAG, SDValue Idx, DAG.getConstant(MaxIndex, dl, IdxVT)); } -SDValue TargetLowering::getVectorElementPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { return getVectorSubVecPointer( DAG, VecPtr, VecVT, EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(), 1), - Index); + Index, PtrArithFlags); } -SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, - SDValue VecPtr, EVT VecVT, - EVT SubVecVT, - SDValue Index) const { +SDValue +TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, SDValue VecPtr, + EVT VecVT, EVT SubVecVT, SDValue Index, + const SDNodeFlags PtrArithFlags) const { SDLoc dl(Index); // Make sure the index type is big enough to compute in. 
Index = DAG.getZExtOrTrunc(Index, dl, VecPtr.getValueType()); @@ -10704,7 +10705,7 @@ SDValue TargetLowering::getVectorSubVecPointer(SelectionDAG &DAG, Index = DAG.getNode(ISD::MUL, dl, IdxVT, Index, DAG.getConstant(EltSize, dl, IdxVT)); - return DAG.getMemBasePlusOffset(VecPtr, Index, dl); + return DAG.getMemBasePlusOffset(VecPtr, Index, dl, PtrArithFlags); } //===----------------------------------------------------------------------===// @@ -12382,8 +12383,10 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, !IsFast) return SDValue(); - SDValue NewPtr = - getVectorElementPointer(DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); + // The original DAG loaded the entire vector from memory, so arithmetic + // within it must be inbounds. + SDValue NewPtr = getInboundsVectorElementPointer( + DAG, OriginalLoad->getBasePtr(), InVecVT, EltNo); // We are replacing a vector load with a scalar load. The new load must have // identical memory op ordering to the original. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b8b419d93021a..f16eb1649be42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1828,72 +1828,83 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. 
- // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0. - - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = + Addr.getOpcode() == ISD::PTRADD && Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? 
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. 
+ + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable? + SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 863177ae3d6b5..9a9d2ddce942b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -64,7 +64,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = nsw V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr1, killed $vcc, 0, implicit $exec @@ -961,7 +961,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.71(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, 
$vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = nsw V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec @@ -1009,12 +1009,12 @@ bb: %i11 = icmp eq i32 %i, 0 %i12 = load i32, ptr addrspace(3) null, align 8 %i13 = zext i32 %i12 to i64 - %i14 = getelementptr i32, ptr addrspace(1) %arg, i64 %i13 + %i14 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %i13 br i1 %arg3, label %bb15, label %bb103 bb15: %i16 = zext i32 %i to i64 - %i17 = getelementptr i32, ptr addrspace(1) %i14, i64 %i16 + %i17 = getelementptr inbounds i32, ptr addrspace(1) %i14, i64 %i16 %i18 = ptrtoint ptr addrspace(1) %i17 to i64 br i1 %arg4, label %bb19, label %bb20 @@ -1023,7 +1023,7 @@ 
bb19: unreachable bb20: - %i21 = getelementptr i32, ptr addrspace(1) %i17, i64 256 + %i21 = getelementptr inbounds i32, ptr addrspace(1) %i17, i64 256 %i22 = ptrtoint ptr addrspace(1) %i21 to i64 %i23 = inttoptr i64 %i22 to ptr %i24 = load i8, ptr %i23, align 1 @@ -1035,7 +1035,7 @@ bb26: unreachable bb27: - %i28 = getelementptr i32, ptr addrspace(1) %i17, i64 512 + %i28 = getelementptr inbounds i32, ptr addrspace(1) %i17, i64 512 %i29 = ptrtoint ptr addrspace(1) %i28 to i64 %i30 = inttoptr i64 %i29 to ptr %i31 = load i8, ptr %i30, align 1 @@ -1047,7 +1047,7 @@ bb33: unreachable bb34: - %i35 = getelementptr i32, ptr addrspace(1) %i17, i64 768 + %i35 = getelementptr inbounds i32, ptr addrspace(1) %i17, i64 768 %i36 = ptrtoint ptr addrspace(1) %i35 to i64 %i37 = inttoptr i64 %i36 to ptr %i38 = load i8, ptr %i37, align 1 @@ -1059,7 +1059,7 @@ bb40: unreachable bb41: - %i42 = getelementptr i32, ptr addrspace(1) %i17, i64 1024 + %i42 = getelementptr inbounds i32, ptr addrspace(1) %i17, i64 1024 %i43 = ptrtoint ptr addrspace(1) %i42 to i64 %i44 = inttoptr i64 %i43 to ptr %i45 = load i8, ptr %i44, align 1 @@ -1071,7 +1071,7 @@ bb47: unreachable bb48: - %i49 = getelementptr i32, ptr addrspace(1) %i17, i64 1280 + %i49 = getelementptr inbounds i32, ptr addrspace(1) %i17, i64 1280 %i50 = ptrtoint ptr addrspace(1) %i49 to i64 %i51 = inttoptr i64 %i50 to ptr %i52 = load i8, ptr %i51, align 1 @@ -1083,7 +1083,7 @@ bb54: unreachable bb55: - %i56 = getelementptr i32, ptr addrspace(1) %i17, i64 1536 + %i56 = getelementptr inbounds i32, ptr addrspace(1) %i17, i64 1536 %i57 = ptrtoint ptr addrspace(1) %i56 to i64 %i58 = or i64 %i57, 1 %i59 = inttoptr i64 %i58 to ptr @@ -1115,7 +1115,7 @@ bb67: bb68: %i69 = zext i1 %arg5 to i8 - %i70 = getelementptr [2 x i32], ptr addrspace(1) null, i64 %i16 + %i70 = getelementptr inbounds [2 x i32], ptr addrspace(1) null, i64 %i16 %i71 = ptrtoint ptr addrspace(1) %i70 to i64 br i1 %arg5, label %bb72, label %bb73 @@ -1124,7 +1124,7 @@ bb72: 
unreachable bb73: - %i74 = getelementptr [2 x i32], ptr addrspace(1) %i70, i64 256 + %i74 = getelementptr inbounds [2 x i32], ptr addrspace(1) %i70, i64 256 %i75 = ptrtoint ptr addrspace(1) %i74 to i64 %i76 = inttoptr i64 %i75 to ptr %i77 = load i8, ptr %i76, align 1 @@ -1136,7 +1136,7 @@ bb79: unreachable bb80: - %i81 = getelementptr [2 x i32], ptr addrspace(1) %i70, i64 512 + %i81 = getelementptr inbounds [2 x i32], ptr addrspace(1) %i70, i64 512 %i82 = ptrtoint ptr addrspace(1) %i81 to i64 %i83 = or i64 %i82, 1 br i1 %arg6, label %bb84, label %bb85 @@ -1271,7 +1271,7 @@ bb174: %i182 = select i1 %arg3, i32 %i181, i32 0 %i183 = or i32 %i182, %i154 %i184 = or i32 %i183, %i156 - %i185 = getelementptr [2 x i32], ptr addrspace(1) %arg1, i64 %i13 + %i185 = getelementptr inbounds [2 x i32], ptr addrspace(1) %arg1, i64 %i13 br i1 %arg3, label %bb186, label %bb196 bb186: diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 890f4f77ed107..e509d7b2b9b1b 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -12,8 +12,8 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; OPT-GFX7-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX7-NEXT: entry: -; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 -; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7 +; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 7 ; OPT-GFX7-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX7-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX7: if: @@ -28,8 +28,8 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; ; OPT-GFX8-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX8-NEXT: 
entry: -; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 -; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7 +; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 7 ; OPT-GFX8-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX8-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX8: if: @@ -44,11 +44,11 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; ; OPT-GFX9-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX9-NEXT: entry: -; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX9-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX9-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX9: if: -; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 28 +; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 28 ; OPT-GFX9-NEXT: [[LOAD:%.*]] = load i32, ptr [[SUNKADDR]], align 4 ; OPT-GFX9-NEXT: br label [[ENDIF]] ; OPT-GFX9: endif: @@ -58,11 +58,11 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; ; OPT-GFX10-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX10-NEXT: entry: -; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX10-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX10-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX10: if: -; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 28 +; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 28 ; OPT-GFX10-NEXT: [[LOAD:%.*]] = load 
i32, ptr [[SUNKADDR]], align 4 ; OPT-GFX10-NEXT: br label [[ENDIF]] ; OPT-GFX10: endif: @@ -146,8 +146,8 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 999999 - %in.gep = getelementptr i32, ptr %in, i64 7 + %out.gep = getelementptr inbounds i32, ptr %out, i64 999999 + %in.gep = getelementptr inbounds i32, ptr %in, i64 7 %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %endif, label %if @@ -167,12 +167,12 @@ done: define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, i32 %cond) { ; OPT-GFX7-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX7-NEXT: entry: -; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX7-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX7-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX7: if: ; OPT-GFX7-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1) -; OPT-GFX7-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28 +; OPT-GFX7-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 28 ; OPT-GFX7-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4 ; OPT-GFX7-NEXT: br label [[ENDIF]] ; OPT-GFX7: endif: @@ -182,8 +182,8 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; ; OPT-GFX8-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX8-NEXT: entry: -; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 -; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7 +; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i32, ptr 
[[IN:%.*]], i64 7 ; OPT-GFX8-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX8-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX8: if: @@ -197,12 +197,12 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; ; OPT-GFX9-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX9-NEXT: entry: -; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX9-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX9-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX9: if: ; OPT-GFX9-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1) -; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28 +; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 28 ; OPT-GFX9-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4 ; OPT-GFX9-NEXT: br label [[ENDIF]] ; OPT-GFX9: endif: @@ -212,12 +212,12 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; ; OPT-GFX10-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX10-NEXT: entry: -; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX10-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX10-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX10: if: ; OPT-GFX10-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1) -; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28 +; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 28 ; OPT-GFX10-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4 ; OPT-GFX10-NEXT: br label 
[[ENDIF]] ; OPT-GFX10: endif: @@ -303,8 +303,8 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 999999 - %in.gep = getelementptr i32, ptr %in, i64 7 + %out.gep = getelementptr inbounds i32, ptr %out, i64 999999 + %in.gep = getelementptr inbounds i32, ptr %in, i64 7 %cast = addrspacecast ptr %in.gep to ptr addrspace(1) %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %endif, label %if @@ -325,12 +325,12 @@ done: define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in, i32 %cond) { ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT: if: ; OPT-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(4) -; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i64 28 +; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP0]], i64 28 ; OPT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[SUNKADDR]], align 4 ; OPT-NEXT: br label [[ENDIF]] ; OPT: endif: @@ -416,8 +416,8 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 999999 - %in.gep = getelementptr i32, ptr %in, i64 7 + %out.gep = getelementptr inbounds i32, ptr %out, i64 999999 + %in.gep = getelementptr inbounds i32, ptr %in, i64 7 %cast = addrspacecast ptr %in.gep to ptr addrspace(4) %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %endif, label %if @@ -438,8 +438,8 @@ done: define void 
@test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; OPT-GFX7-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX7-NEXT: entry: -; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX7-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX7-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX7-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -456,8 +456,8 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; ; OPT-GFX8-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX8-NEXT: entry: -; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX8-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX8-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX8-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -474,12 +474,12 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; ; OPT-GFX9-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX9-NEXT: entry: -; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 ; OPT-GFX9-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX9-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX9-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label 
[[IF:%.*]] ; OPT-GFX9: if: -; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX9-NEXT: [[LOAD:%.*]] = load i8, ptr [[SUNKADDR]], align 1 ; OPT-GFX9-NEXT: [[CAST:%.*]] = sext i8 [[LOAD]] to i32 ; OPT-GFX9-NEXT: br label [[ENDIF]] @@ -490,8 +490,8 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; ; OPT-GFX10-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX10-NEXT: entry: -; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-GFX10-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX10-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX10-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX10-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -588,8 +588,8 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i32 1024 - %in.gep = getelementptr i8, ptr %in, i64 4095 + %out.gep = getelementptr inbounds i32, ptr %out, i32 1024 + %in.gep = getelementptr inbounds i8, ptr %in, i64 4095 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp0 = icmp eq i32 %tid, 0 br i1 %cmp0, label %endif, label %if @@ -611,8 +611,8 @@ done: define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; OPT-LABEL: @test_sink_flat_small_max_plus_1_flat_offset( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 99999 -; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4096 +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, 
ptr [[OUT:%.*]], i64 99999 +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4096 ; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -711,8 +711,8 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 99999 - %in.gep = getelementptr i8, ptr %in, i64 4096 + %out.gep = getelementptr inbounds i32, ptr %out, i64 99999 + %in.gep = getelementptr inbounds i8, ptr %in, i64 4096 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp0 = icmp eq i32 %tid, 0 br i1 %cmp0, label %endif, label %if @@ -734,8 +734,8 @@ done: define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; OPT-LABEL: @test_sinkable_flat_reg_offset( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 [[REG:%.*]] +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 [[REG:%.*]] ; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3]] ; OPT-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -834,8 +834,8 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i32 1024 - %in.gep = getelementptr i8, ptr %in, i64 %reg + %out.gep = getelementptr inbounds i32, ptr %out, i32 1024 + %in.gep = getelementptr inbounds i8, ptr %in, i64 %reg %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp0 = icmp eq i32 %tid, 
0 br i1 %cmp0, label %endif, label %if diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 1a4a54b81c78f..119056a72f2c6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -373,7 +373,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -570,7 +570,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -995,7 +995,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -1219,7 +1219,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 
-512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -1409,7 +1409,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -1630,7 +1630,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -1795,7 +1795,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst ret void } @@ -1947,7 +1947,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2165,7 +2165,7 
@@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -2330,7 +2330,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 ret void } @@ -2698,7 +2698,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2895,7 +2895,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -3320,7 +3320,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3544,7 +3544,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3734,7 +3734,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -3955,7 +3955,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4145,7 +4145,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float 
%result } @@ -4366,7 +4366,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -6590,7 +6590,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -7052,7 +7052,7 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -7931,7 +7931,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8381,7 +8381,7 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 
s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9141,7 +9141,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -9526,7 +9526,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -10256,7 +10256,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10628,7 +10628,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -10902,7 +10902,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11186,7 +11186,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -11574,7 +11574,7 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -11949,7 +11949,7 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -12872,7 +12872,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -13340,7 +13340,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -13792,7 +13792,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14245,7 +14245,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14618,7 +14618,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, 
bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -14979,7 +14979,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15887,7 +15887,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -16343,7 +16343,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -16726,7 +16726,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -16930,7 +16930,7 @@ define <2 x half> 
@flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -17297,7 +17297,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -17500,7 +17500,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -17694,7 +17694,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -17884,7 +17884,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -19273,7 +19273,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -19613,7 +19613,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -20246,7 +20246,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -20585,7 +20585,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, 
ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -20914,7 +20914,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -21237,7 +21237,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 59b0537b817d2..0c592a2097896 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -301,7 +301,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -468,7 +468,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, 
i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -757,7 +757,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -924,7 +924,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1076,7 +1076,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -1227,7 +1227,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1859,7 +1859,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2026,7 +2026,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2315,7 +2315,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2482,7 +2482,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2634,7 +2634,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float 
%val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2785,7 +2785,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3591,7 +3591,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -4004,7 +4004,7 @@ define double @flat_agent_atomic_fmax_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %result = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -4795,7 +4795,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5201,7 +5201,7 @@ define void @flat_agent_atomic_fmax_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %unused = atomicrmw fmax ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6824,7 +6824,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -7234,7 +7234,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -8009,7 +8009,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8406,7 +8406,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -8710,7 +8710,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -9004,7 +9004,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9417,7 +9417,7 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -9817,7 +9817,7 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10742,7 +10742,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -11211,7 +11211,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -12102,7 +12102,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -12556,7 +12556,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -12930,7 +12930,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, 
bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -13292,7 +13292,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13764,7 +13764,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -14221,7 +14221,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14700,7 +14700,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -14956,7 +14956,7 @@ define <2 x half> 
@flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -15417,7 +15417,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15670,7 +15670,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15912,7 +15912,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -16149,7 +16149,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -17050,7 +17050,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -17518,7 +17518,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -18389,7 +18389,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -18850,7 +18850,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, 
ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -19304,7 +19304,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -19747,7 +19747,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c9c9f332fe391..d08fdc9809e19 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -301,7 +301,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -468,7 +468,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, 
i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -757,7 +757,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -924,7 +924,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1076,7 +1076,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -1227,7 +1227,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1859,7 +1859,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; 
GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2026,7 +2026,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2315,7 +2315,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2482,7 +2482,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2634,7 +2634,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, float 
%val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2785,7 +2785,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3591,7 +3591,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -4004,7 +4004,7 @@ define double @flat_agent_atomic_fmin_ret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %result = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret double %result } @@ -4795,7 +4795,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5201,7 +5201,7 @@ define void @flat_agent_atomic_fmin_noret_f64__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %unused = atomicrmw fmin ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6824,7 +6824,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -7234,7 +7234,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -8009,7 +8009,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8406,7 +8406,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -8710,7 +8710,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -9004,7 +9004,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9417,7 +9417,7 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -9817,7 +9817,7 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10742,7 +10742,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -11211,7 +11211,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -12102,7 +12102,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -12556,7 +12556,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -12930,7 +12930,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, 
bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -13292,7 +13292,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13764,7 +13764,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -14221,7 +14221,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14700,7 +14700,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -14956,7 +14956,7 @@ define <2 x half> 
@flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -15417,7 +15417,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15670,7 +15670,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15912,7 +15912,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -16149,7 +16149,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -17050,7 +17050,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -17518,7 +17518,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -18389,7 +18389,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -18850,7 +18850,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, 
ptr %ptr, i64 -512 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -19304,7 +19304,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -19747,7 +19747,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 587c2ea885077..e5c967666c9d7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -405,7 +405,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret float %result } @@ -622,7 +622,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = 
getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret float %result } @@ -1001,7 +1001,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret void } @@ -1213,7 +1213,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret void } @@ -1416,7 +1416,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val seq_cst ret float %result } @@ -1612,7 +1612,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val seq_cst ret void } @@ -2012,7 +2012,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 
511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret float %result } @@ -2229,7 +2229,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret float %result } @@ -2608,7 +2608,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret void } @@ -2820,7 +2820,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst ret void } @@ -3023,7 +3023,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, float %val seq_cst ret float %result } @@ -3219,7 +3219,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - 
%gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, float %val seq_cst ret void } @@ -4085,7 +4085,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst ret double %result } @@ -4532,7 +4532,7 @@ define double @flat_agent_atomic_fsub_ret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %result = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst ret double %result } @@ -5385,7 +5385,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_pos(ptr %ptr, double %v ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 255 + %gep = getelementptr inbounds double, ptr %ptr, i64 255 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst ret void } @@ -5822,7 +5822,7 @@ define void @flat_agent_atomic_fsub_noret_f64__offset12b_neg(ptr %ptr, double %v ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %ptr, i64 -256 + %gep = getelementptr inbounds double, ptr %ptr, i64 -256 %unused = atomicrmw fsub ptr %gep, double %val syncscope("agent") seq_cst ret void } @@ -6582,7 +6582,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst ret half %result } @@ -6967,7 +6967,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst ret half %result } @@ -7697,7 +7697,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst ret void } @@ -8069,7 +8069,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds half, ptr %ptr, i64 -1024 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst ret void } @@ -8353,7 +8353,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4 ret half %result } @@ -8627,7 +8627,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4 ret void } @@ -9015,7 +9015,7 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, half %val seq_cst ret half %result } @@ -9390,7 +9390,7 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, half %val seq_cst ret void } @@ -10313,7 +10313,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } @@ -10781,7 +10781,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst ret bfloat %result } @@ -11670,7 +11670,7 @@ define void 
@flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst ret void } @@ -12123,7 +12123,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 -1024 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst ret void } @@ -12496,7 +12496,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4 ret bfloat %result } @@ -12857,7 +12857,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4 ret void } @@ -13328,7 +13328,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw 
fsub ptr %gep, bfloat %val seq_cst ret bfloat %result } @@ -13784,7 +13784,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fsub ptr %gep, bfloat %val seq_cst ret void } @@ -14229,7 +14229,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } @@ -14468,7 +14468,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result } @@ -14891,7 +14891,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst ret void } @@ -15125,7 +15125,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, 
ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst ret void } @@ -15350,7 +15350,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst ret <2 x half> %result } @@ -15568,7 +15568,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst ret void } @@ -16469,7 +16469,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } @@ -16937,7 +16937,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr, ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result } @@ -17808,7 +17808,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; 
GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } @@ -18269,7 +18269,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst ret void } @@ -18723,7 +18723,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr, ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result } @@ -19166,7 +19166,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst ret void } diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 57be2907da4a0..b35f07002a48a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -64,7 +64,7 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr 
%out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -128,7 +128,7 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1023 + %gep = getelementptr inbounds i32, ptr %out, i32 1023 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -196,7 +196,7 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1024 + %gep = getelementptr inbounds i32, ptr %out, i32 1024 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -270,7 +270,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -352,8 +352,8 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -444,8 +444,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = 
getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -652,7 +652,7 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst ret void } @@ -739,7 +739,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -804,7 +804,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -878,7 +878,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -960,8 +960,8 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = 
getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1052,8 +1052,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -1260,7 +1260,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1347,7 +1347,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -1412,7 +1412,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1486,7 +1486,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: 
flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -1568,8 +1568,8 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1660,8 +1660,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -1868,7 +1868,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1955,7 +1955,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw 
volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -2016,7 +2016,7 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2089,7 +2089,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -2167,8 +2167,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2258,8 +2258,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -2457,7 +2457,7 @@ define amdgpu_kernel void 
@atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2543,7 +2543,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -2604,7 +2604,7 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2677,7 +2677,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -2755,8 +2755,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile umax ptr %gep, i32 %in 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2846,8 +2846,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -3045,7 +3045,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3131,7 +3131,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -3192,7 +3192,7 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3265,7 +3265,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr 
i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -3343,8 +3343,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3434,8 +3434,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -3633,7 +3633,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3719,7 +3719,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -3780,7 +3780,7 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -3931,8 +3931,8 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4022,8 +4022,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -4221,7 +4221,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: 
buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4307,7 +4307,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -4372,7 +4372,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4446,7 +4446,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -4528,8 +4528,8 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4620,8 +4620,8 @@ 
define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -4828,7 +4828,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4915,7 +4915,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -4980,7 +4980,7 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -5044,7 +5044,7 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr float, ptr %out, i32 4 + %gep = getelementptr inbounds float, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst ret void } @@ 
-5118,7 +5118,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -5200,8 +5200,8 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -5292,8 +5292,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -5500,7 +5500,7 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst ret void } @@ -5587,7 +5587,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile xchg ptr %ptr, i32 
%in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -5652,7 +5652,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5729,7 +5729,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 store i32 %flag, ptr %out2 @@ -5819,8 +5819,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } @@ -5918,8 +5918,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 store i32 %flag, ptr %out2 @@ -6136,7 +6136,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind ; GFX11-NEXT: 
buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } @@ -6230,7 +6230,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 store i32 %flag, ptr %out2 @@ -6296,7 +6296,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6370,7 +6370,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -6452,8 +6452,8 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6544,8 +6544,8 @@ define amdgpu_kernel 
void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -6752,7 +6752,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6839,7 +6839,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -6905,7 +6905,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %in, i32 4 + %gep = getelementptr inbounds i32, ptr %in, i32 4 %val = load atomic i32, ptr %gep seq_cst, align 4 store i32 %val, ptr %out ret void @@ -7050,8 +7050,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %in, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %in, i64 %index + %gep = getelementptr inbounds i32, ptr 
%ptr, i32 4 %val = load atomic i32, ptr %gep seq_cst, align 4 store i32 %val, ptr %out ret void @@ -7131,7 +7131,7 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %in, i64 %index + %ptr = getelementptr inbounds i32, ptr %in, i64 %index %val = load atomic i32, ptr %ptr seq_cst, align 4 store i32 %val, ptr %out ret void @@ -7186,7 +7186,7 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 store atomic i32 %in, ptr %gep seq_cst, align 4 ret void } @@ -7302,8 +7302,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 store atomic i32 %in, ptr %gep seq_cst, align 4 ret void } @@ -7366,7 +7366,7 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index store atomic i32 %in, ptr %ptr seq_cst, align 4 ret void } @@ -7431,7 +7431,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr float, ptr %in, i32 4 + %gep = getelementptr inbounds float, ptr %in, i32 4 %val = load atomic float, ptr %gep seq_cst, align 4 store float %val, ptr %out ret void @@ -7576,8 +7576,8 @@ define amdgpu_kernel void 
@atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr float, ptr %in, i64 %index - %gep = getelementptr float, ptr %ptr, i32 4 + %ptr = getelementptr inbounds float, ptr %in, i64 %index + %gep = getelementptr inbounds float, ptr %ptr, i32 4 %val = load atomic float, ptr %gep seq_cst, align 4 store float %val, ptr %out ret void @@ -7657,7 +7657,7 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr float, ptr %in, i64 %index + %ptr = getelementptr inbounds float, ptr %in, i64 %index %val = load atomic float, ptr %ptr seq_cst, align 4 store float %val, ptr %out ret void @@ -7712,7 +7712,7 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) { ; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr float, ptr %out, i32 4 + %gep = getelementptr inbounds float, ptr %out, i32 4 store atomic float %in, ptr %gep seq_cst, align 4 ret void } @@ -7828,8 +7828,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:16 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr float, ptr %out, i64 %index - %gep = getelementptr float, ptr %ptr, i32 4 + %ptr = getelementptr inbounds float, ptr %out, i64 %index + %gep = getelementptr inbounds float, ptr %ptr, i32 4 store atomic float %in, ptr %gep seq_cst, align 4 ret void } @@ -7892,7 +7892,7 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr float, ptr %out, i64 %index + %ptr = getelementptr inbounds float, ptr %out, i64 %index store atomic float %in, ptr %ptr seq_cst, align 4 ret void } @@ -7971,7 +7971,7 @@ define amdgpu_kernel void 
@atomic_load_i8_offset(ptr %in, ptr %out) { ; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %gep = getelementptr i8, ptr %in, i64 16 + %gep = getelementptr inbounds i8, ptr %in, i64 16 %val = load atomic i8, ptr %gep seq_cst, align 1 store i8 %val, ptr %out ret void @@ -8145,8 +8145,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 % ; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %ptr = getelementptr i8, ptr %in, i64 %index - %gep = getelementptr i8, ptr %ptr, i64 16 + %ptr = getelementptr inbounds i8, ptr %in, i64 %index + %gep = getelementptr inbounds i8, ptr %ptr, i64 16 %val = load atomic i8, ptr %gep seq_cst, align 1 store i8 %val, ptr %out ret void @@ -8212,7 +8212,7 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) { ; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %gep = getelementptr i8, ptr %out, i64 16 + %gep = getelementptr inbounds i8, ptr %out, i64 16 store atomic i8 %in, ptr %gep seq_cst, align 1 ret void } @@ -8348,8 +8348,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 % ; GFX11-FAKE16-NEXT: flat_store_b8 v[0:1], v2 offset:16 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %ptr = getelementptr i8, ptr %out, i64 %index - %gep = getelementptr i8, ptr %ptr, i64 16 + %ptr = getelementptr inbounds i8, ptr %out, i64 %index + %gep = getelementptr inbounds i8, ptr %ptr, i64 16 store atomic i8 %in, ptr %gep seq_cst, align 1 ret void } @@ -8428,7 +8428,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) { ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %gep = getelementptr i16, ptr %in, i64 8 + %gep = getelementptr inbounds i16, ptr %in, i64 8 %val = load atomic i16, ptr %gep seq_cst, align 2 store i16 %val, ptr %out ret void @@ -8607,8 +8607,8 @@ define amdgpu_kernel void 
@atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %ptr = getelementptr i16, ptr %in, i64 %index - %gep = getelementptr i16, ptr %ptr, i64 8 + %ptr = getelementptr inbounds i16, ptr %in, i64 %index + %gep = getelementptr inbounds i16, ptr %ptr, i64 8 %val = load atomic i16, ptr %gep seq_cst, align 2 store i16 %val, ptr %out ret void @@ -8674,7 +8674,7 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) { ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %gep = getelementptr i16, ptr %out, i64 8 + %gep = getelementptr inbounds i16, ptr %out, i64 8 store atomic i16 %in, ptr %gep seq_cst, align 2 ret void } @@ -8816,8 +8816,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %ptr = getelementptr i16, ptr %out, i64 %index - %gep = getelementptr i16, ptr %ptr, i64 8 + %ptr = getelementptr inbounds i16, ptr %out, i64 %index + %gep = getelementptr inbounds i16, ptr %ptr, i64 8 store atomic i16 %in, ptr %gep seq_cst, align 2 ret void } @@ -8882,7 +8882,7 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 offset:16 ; GFX11-FAKE16-NEXT: s_endpgm entry: - %gep = getelementptr half, ptr %out, i64 8 + %gep = getelementptr inbounds half, ptr %out, i64 8 store atomic half %in, ptr %gep seq_cst, align 2 ret void } @@ -9002,7 +9002,7 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm - %gep = getelementptr bfloat, ptr %out, i64 8 + %gep = getelementptr inbounds bfloat, ptr %out, i64 8 store atomic bfloat %in, ptr %out seq_cst, align 2 ret void } @@ -9125,7 +9125,7 @@ define 
amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9189,7 +9189,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1023 + %gep = getelementptr inbounds i32, ptr %out, i32 1023 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9257,7 +9257,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1024 + %gep = getelementptr inbounds i32, ptr %out, i32 1024 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9331,7 +9331,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -9413,8 +9413,8 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -9505,8 +9505,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -9713,7 +9713,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9800,7 +9800,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -9865,7 +9865,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9929,7 +9929,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1023 + %gep = getelementptr 
inbounds i32, ptr %out, i32 1023 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9997,7 +9997,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) { ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1024 + %gep = getelementptr inbounds i32, ptr %out, i32 1024 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10071,7 +10071,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -10153,8 +10153,8 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 % ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10245,8 +10245,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ 
-10453,7 +10453,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10540,7 +10540,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index + %ptr = getelementptr inbounds i32, ptr %out, i64 %index %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i32 %val, ptr %out2 ret void @@ -10619,7 +10619,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm - %gep = getelementptr half, ptr %in, i64 8 + %gep = getelementptr inbounds half, ptr %in, i64 8 %val = load atomic half, ptr %gep seq_cst, align 2 store half %val, ptr %out ret void @@ -10772,7 +10772,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) { ; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-FAKE16-NEXT: flat_store_b16 v[0:1], v2 ; GFX11-FAKE16-NEXT: s_endpgm - %gep = getelementptr bfloat, ptr %in, i64 8 + %gep = getelementptr inbounds bfloat, ptr %in, i64 8 %val = load atomic bfloat, ptr %gep seq_cst, align 2 store bfloat %val, ptr %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 47161954cc332..45f9d9e774079 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -63,7 +63,7 @@ define void 
@flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst ret void } @@ -124,7 +124,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst ret i32 %result } @@ -203,7 +203,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst ret void } @@ -282,7 +282,7 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst ret i32 %result } @@ -315,7 +315,7 @@ define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -348,7 +348,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; 
GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -413,7 +413,7 @@ define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %out, i32 4 + %gep = getelementptr inbounds float, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst ret void } @@ -474,7 +474,7 @@ define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %out, i32 4 + %gep = getelementptr inbounds float, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, float %in seq_cst ret float %result } @@ -553,7 +553,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %out, i32 4 + %gep = getelementptr inbounds float, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst ret void } @@ -632,7 +632,7 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %out, i32 4 + %gep = getelementptr inbounds float, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, float %in seq_cst ret float %result } @@ -665,7 +665,7 @@ define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %out, i64 4 + %gep = getelementptr 
inbounds float, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -698,7 +698,7 @@ define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %out, i64 4 + %gep = getelementptr inbounds float, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0 ret float %result } @@ -763,7 +763,7 @@ define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst ret void } @@ -824,7 +824,7 @@ define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw add ptr %gep, i32 %in seq_cst ret i32 %result } @@ -903,7 +903,7 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst ret void } @@ -982,7 +982,7 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw add ptr %gep, i32 %in seq_cst ret i32 %result } @@ -1015,7 
+1015,7 @@ define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1048,7 +1048,7 @@ define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -1191,7 +1191,7 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst ret void } @@ -1334,7 +1334,7 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst ret i32 %result } @@ -1485,7 +1485,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst ret void } @@ -1644,7 +1644,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; 
GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst ret i32 %result } @@ -1677,7 +1677,7 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -1710,7 +1710,7 @@ define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -1853,7 +1853,7 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst ret void } @@ -1996,7 +1996,7 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst ret i32 %result } @@ -2147,7 +2147,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: 
s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst ret void } @@ -2306,7 +2306,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst ret i32 %result } @@ -2339,7 +2339,7 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -2372,7 +2372,7 @@ define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -2521,7 +2521,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst ret void } @@ -2670,7 +2670,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds 
i32, ptr %out, i32 4 %result = atomicrmw nand ptr %gep, i32 %in seq_cst ret i32 %result } @@ -2827,7 +2827,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst ret void } @@ -2992,7 +2992,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw nand ptr %gep, i32 %in seq_cst ret i32 %result } @@ -3067,7 +3067,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3143,7 +3143,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -3286,7 +3286,7 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in 
seq_cst ret void } @@ -3429,7 +3429,7 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst ret i32 %result } @@ -3580,7 +3580,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst ret void } @@ -3739,7 +3739,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst ret i32 %result } @@ -3772,7 +3772,7 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -3805,7 +3805,7 @@ define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -3948,7 +3948,7 @@ define void 
@flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst ret void } @@ -4091,7 +4091,7 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst ret i32 %result } @@ -4242,7 +4242,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst ret void } @@ -4401,7 +4401,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst ret i32 %result } @@ -4434,7 +4434,7 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -4467,7 +4467,7 @@ define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: 
buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -4610,7 +4610,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst ret void } @@ -4753,7 +4753,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw max ptr %gep, i32 %in seq_cst ret i32 %result } @@ -4904,7 +4904,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst ret void } @@ -5063,7 +5063,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw max ptr %gep, i32 %in seq_cst ret i32 %result } @@ -5157,8 +5157,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = 
getelementptr inbounds i32, ptr %out, i32 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst ret void } @@ -5267,8 +5267,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i32 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst store i32 %tmp0, ptr %out2 ret void @@ -5359,7 +5359,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index + %ptr = getelementptr inbounds i32, ptr %out, i32 %index %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst ret void } @@ -5464,7 +5464,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index + %ptr = getelementptr inbounds i32, ptr %out, i32 %index %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst store i32 %tmp0, ptr %out2 ret void @@ -5498,7 +5498,7 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -5531,7 +5531,7 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr 
inbounds i32, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -5674,7 +5674,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst ret void } @@ -5817,7 +5817,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw umax ptr %gep, i32 %in seq_cst ret i32 %result } @@ -5968,7 +5968,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst ret void } @@ -6127,7 +6127,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw umax ptr %gep, i32 %in seq_cst ret i32 %result } @@ -6221,8 +6221,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i32 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %tmp0 = 
atomicrmw umax ptr %gep, i32 %in seq_cst ret void } @@ -6331,8 +6331,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2 ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i32 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst store i32 %tmp0, ptr %out2 ret void @@ -6438,7 +6438,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 % ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index + %ptr = getelementptr inbounds i32, ptr %out, i32 %index %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst store i32 %tmp0, ptr %out2 ret void @@ -6472,7 +6472,7 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -6505,7 +6505,7 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -6648,7 +6648,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = 
atomicrmw umin ptr %gep, i32 %in seq_cst ret void } @@ -6791,7 +6791,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw umin ptr %gep, i32 %in seq_cst ret i32 %result } @@ -6942,7 +6942,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst ret void } @@ -7101,7 +7101,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw umin ptr %gep, i32 %in seq_cst ret i32 %result } @@ -7134,7 +7134,7 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 % ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -7167,7 +7167,7 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -7310,7 +7310,7 
@@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst ret void } @@ -7453,7 +7453,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw min ptr %gep, i32 %in seq_cst ret i32 %result } @@ -7604,7 +7604,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst ret void } @@ -7763,7 +7763,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw min ptr %gep, i32 %in seq_cst ret i32 %result } @@ -7857,8 +7857,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i32 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst ret void } @@ -7967,8 +7967,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, ; 
GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i32 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst store i32 %tmp0, ptr %out2 ret void @@ -8151,7 +8151,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i32 %index + %ptr = getelementptr inbounds i32, ptr %out, i32 %index %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst store i32 %tmp0, ptr %out2 ret void @@ -8185,7 +8185,7 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -8218,7 +8218,7 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -8373,7 +8373,7 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst ret void } @@ -8528,7 +8528,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst ret i32 %result } @@ -8691,7 +8691,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst ret void } @@ -8862,7 +8862,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst ret i32 %result } @@ -8895,7 +8895,7 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -8928,7 +8928,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } @@ -9089,7 +9089,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: 
s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst ret void } @@ -9250,7 +9250,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst ret i32 %result } @@ -9425,7 +9425,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst ret void } @@ -9608,7 +9608,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end ; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst ret i32 %result } @@ -9641,7 +9641,7 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } @@ -9674,7 +9674,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_setpc_b64 s[30:31] - %gep = 
getelementptr i32, ptr %out, i64 4 + %gep = getelementptr inbounds i32, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret i32 %result } diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 1f105e8dd8ba5..6dfe4594a248c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -138,7 +138,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst ret void } @@ -284,7 +284,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 ret void @@ -438,8 +438,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst ret void } @@ -590,8 +590,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr 
inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 ret void @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst ret void } @@ -1161,7 +1161,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 ret void @@ -1299,7 +1299,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1442,7 +1442,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -1593,8 +1593,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + 
%ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1742,8 +1742,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -2158,7 +2158,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2301,7 +2301,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -2442,7 +2442,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2588,7 
+2588,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -2742,8 +2742,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2894,8 +2894,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -3319,7 +3319,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3465,7 +3465,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], 
v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -3606,7 +3606,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3754,7 +3754,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -3908,8 +3908,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4062,8 +4062,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 
4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -4489,7 +4489,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4637,7 +4637,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -4778,7 +4778,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4926,7 +4926,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -5080,8 +5080,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm 
entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5234,8 +5234,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -5661,7 +5661,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -5809,7 +5809,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -5950,7 +5950,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = 
atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6098,7 +6098,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -6252,8 +6252,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -6406,8 +6406,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -6833,7 +6833,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } 
@@ -6981,7 +6981,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -7122,7 +7122,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7270,7 +7270,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -7424,8 +7424,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -7578,8 +7578,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep 
= getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -8005,7 +8005,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8153,7 +8153,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -8291,7 +8291,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8434,7 +8434,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -8585,8 +8585,8 @@ define 
amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8734,8 +8734,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -9150,7 +9150,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -9293,7 +9293,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -9420,7 +9420,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: 
s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst ret void } @@ -9546,7 +9546,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst ret void } @@ -9672,7 +9672,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr ptr, ptr %out, i32 4 + %gep = getelementptr inbounds ptr, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst ret void } @@ -9812,7 +9812,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 ret void @@ -9952,8 +9952,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst ret void } @@ -10098,8 +10098,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr 
%out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 ret void @@ -10489,7 +10489,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst ret void } @@ -10629,7 +10629,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst store i64 %tmp0, ptr %out2 ret void @@ -10767,7 +10767,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10910,7 +10910,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -11061,8 +11061,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: 
scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11210,8 +11210,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -11626,7 +11626,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11769,7 +11769,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -11820,7 +11820,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %in, i64 4 + %gep = getelementptr 
inbounds i64, ptr %in, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -11930,8 +11930,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %in, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %in, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -11991,7 +11991,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %in, i64 %index + %ptr = getelementptr inbounds i64, ptr %in, i64 %index %val = load atomic i64, ptr %ptr seq_cst, align 8 store i64 %val, ptr %out ret void @@ -12035,7 +12035,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -12129,8 +12129,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -12182,7 +12182,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, 
i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index store atomic i64 %in, ptr %ptr seq_cst, align 8 ret void } @@ -12333,7 +12333,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst ret void } @@ -12484,7 +12484,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 9000 + %gep = getelementptr inbounds i64, ptr %out, i64 9000 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst ret void } @@ -12633,7 +12633,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -12791,8 +12791,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst ret void } @@ -12954,8 +12954,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr 
= getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -13398,7 +13398,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst ret void } @@ -13555,7 +13555,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -13607,7 +13607,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %in, i64 4 + %gep = getelementptr inbounds double, ptr %in, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8 store double %val, ptr %out ret void @@ -13717,8 +13717,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %in, i64 %index - %gep = getelementptr double, ptr %ptr, i64 4 + %ptr = getelementptr inbounds double, ptr %in, i64 %index + %gep = getelementptr inbounds double, ptr %ptr, i64 4 %val = load atomic double, ptr %gep seq_cst, align 
8 store double %val, ptr %out ret void @@ -13778,7 +13778,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %in, i64 %index + %ptr = getelementptr inbounds double, ptr %in, i64 %index %val = load atomic double, ptr %ptr seq_cst, align 8 store double %val, ptr %out ret void @@ -13822,7 +13822,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 store atomic double %in, ptr %gep seq_cst, align 8 ret void } @@ -13916,8 +13916,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %out, i64 %index - %gep = getelementptr double, ptr %ptr, i64 4 + %ptr = getelementptr inbounds double, ptr %out, i64 %index + %gep = getelementptr inbounds double, ptr %ptr, i64 4 store atomic double %in, ptr %gep seq_cst, align 8 ret void } @@ -13969,7 +13969,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %out, i64 %index + %ptr = getelementptr inbounds double, ptr %out, i64 %index store atomic double %in, ptr %ptr seq_cst, align 8 ret void } @@ -14116,7 +14116,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 ret void } @@ -14269,7 +14269,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -14430,8 +14430,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14589,8 +14589,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -15035,7 +15035,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15188,7 +15188,7 @@ define amdgpu_kernel void 
@atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -15345,7 +15345,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15509,7 +15509,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -15679,8 +15679,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15849,8 +15849,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 
4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void @@ -16324,7 +16324,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -16488,7 +16488,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 store i64 %tmp0, ptr %out2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll index 757649ca592b3..e5187a811a230 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -45,7 +45,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -104,7 +104,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + 
%gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -165,8 +165,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -230,8 +230,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -385,7 +385,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -445,7 +445,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ 
-517,7 +517,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -602,7 +602,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -687,8 +687,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -778,8 +778,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1015,7 +1015,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + 
%ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1101,7 +1101,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1175,7 +1175,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1262,7 +1262,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1349,8 +1349,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1442,8 +1442,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] 
; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1685,7 +1685,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -1773,7 +1773,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -1849,7 +1849,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -1938,7 +1938,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2027,8 +2027,8 @@ define 
amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2122,8 +2122,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2371,7 +2371,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2461,7 +2461,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2537,7 +2537,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, 
i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2626,7 +2626,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -2715,8 +2715,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -2810,8 +2810,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3059,7 +3059,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ 
-3149,7 +3149,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3225,7 +3225,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3314,7 +3314,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3403,8 +3403,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3498,8 +3498,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds 
i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3747,7 +3747,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -3837,7 +3837,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3913,7 +3913,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4002,7 +4002,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4091,8 +4091,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - 
%ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4186,8 +4186,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4435,7 +4435,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_SE ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 ret void } @@ -4525,7 +4525,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4597,7 +4597,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, 
!noalias.addrspace !0 ret void } @@ -4682,7 +4682,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4767,8 +4767,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4858,8 +4858,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5095,7 +5095,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5181,7 +5181,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: 
s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5229,7 +5229,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5276,7 +5276,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5323,7 +5323,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr ptr, ptr %out, i32 4 + %gep = getelementptr inbounds ptr, ptr %out, i32 4 %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5382,7 +5382,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5443,8 +5443,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, 
ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5508,8 +5508,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5663,7 +5663,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5723,7 +5723,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5795,7 +5795,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -5880,7 
+5880,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -5965,8 +5965,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6056,8 +6056,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6293,7 +6293,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6379,7 +6379,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, 
ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6430,7 +6430,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %in, i64 4 + %gep = getelementptr inbounds i64, ptr %in, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6540,8 +6540,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %in, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %in, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6601,7 +6601,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %in, i64 %index + %ptr = getelementptr inbounds i64, ptr %in, i64 %index %val = load atomic i64, ptr %ptr seq_cst, align 8 store i64 %val, ptr %out ret void @@ -6645,7 +6645,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -6739,8 +6739,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = 
getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -6792,7 +6792,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index store atomic i64 %in, ptr %ptr seq_cst, align 8 ret void } @@ -6848,7 +6848,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -6904,7 +6904,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 9000 + %gep = getelementptr inbounds i64, ptr %out, i64 9000 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -6964,7 +6964,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -7026,8 +7026,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = 
getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -7099,8 +7099,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -7266,7 +7266,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -7334,7 +7334,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -7386,7 +7386,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %in, i64 4 + %gep = getelementptr inbounds 
double, ptr %in, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7496,8 +7496,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %in, i64 %index - %gep = getelementptr double, ptr %ptr, i64 4 + %ptr = getelementptr inbounds double, ptr %in, i64 %index + %gep = getelementptr inbounds double, ptr %ptr, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7557,7 +7557,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %in, i64 %index + %ptr = getelementptr inbounds double, ptr %in, i64 %index %val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -7601,7 +7601,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7695,8 +7695,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %out, i64 %index - %gep = getelementptr double, ptr %ptr, i64 4 + %ptr = getelementptr inbounds double, ptr %out, i64 %index + %gep = getelementptr inbounds double, ptr %ptr, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7748,7 +7748,7 @@ define amdgpu_kernel void 
@atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %out, i64 %index + %ptr = getelementptr inbounds double, ptr %out, i64 %index store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -7825,7 +7825,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -7916,7 +7916,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8007,8 +8007,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8104,8 +8104,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr 
%ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8355,7 +8355,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8447,7 +8447,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8533,7 +8533,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8632,7 +8632,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -8731,8 +8731,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = 
getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -8836,8 +8836,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -9111,7 +9111,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -9211,7 +9211,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index 4dea4495b36fb..f655d4761fa31 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -63,7 +63,7 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, 
i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -124,7 +124,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -209,7 +209,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -294,7 +294,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -327,7 +327,7 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -360,7 +360,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -425,7 +425,7 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -486,7 +486,7 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -571,7 +571,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -656,7 +656,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -689,7 +689,7 @@ define void 
@flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -722,7 +722,7 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret double %result } @@ -787,7 +787,7 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -848,7 +848,7 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -933,7 +933,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1018,7 +1018,7 
@@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1051,7 +1051,7 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1084,7 +1084,7 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -1251,7 +1251,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1422,7 +1422,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1619,7 
+1619,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1816,7 +1816,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1849,7 +1849,7 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1882,7 +1882,7 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2049,7 +2049,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, 
!noalias.addrspace !1 ret void } @@ -2220,7 +2220,7 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2411,7 +2411,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2602,7 +2602,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2635,7 +2635,7 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -2668,7 +2668,7 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, 
!amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2847,7 +2847,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3030,7 +3030,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3233,7 +3233,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3436,7 +3436,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3526,7 +3526,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in 
seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -3618,7 +3618,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -3785,7 +3785,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3956,7 +3956,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4147,7 +4147,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4338,7 +4338,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 
%in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4371,7 +4371,7 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -4404,7 +4404,7 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -4571,7 +4571,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4742,7 +4742,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4933,7 +4933,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr 
%gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5124,7 +5124,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5157,7 +5157,7 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -5190,7 +5190,7 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -5363,7 +5363,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5540,7 +5540,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw 
max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5749,7 +5749,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5958,7 +5958,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6064,8 +6064,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6180,8 +6180,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6284,7 +6284,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr 
= getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6395,7 +6395,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6429,7 +6429,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -6462,7 +6462,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -6635,7 +6635,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6812,7 +6812,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - 
%gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7021,7 +7021,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7230,7 +7230,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7336,8 +7336,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7452,8 +7452,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -7565,7 +7565,7 @@ define amdgpu_kernel void 
@atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -7599,7 +7599,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -7632,7 +7632,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -7805,7 +7805,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7982,7 +7982,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8191,7 +8191,7 @@ define amdgpu_gfx 
void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8400,7 +8400,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8433,7 +8433,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -8466,7 +8466,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -8639,7 +8639,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } 
@@ -8816,7 +8816,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9025,7 +9025,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9234,7 +9234,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9340,8 +9340,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9456,8 +9456,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds 
i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -9664,7 +9664,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -9698,7 +9698,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -9731,7 +9731,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -9916,7 +9916,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10105,7 +10105,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = 
getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10314,7 +10314,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10523,7 +10523,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -10556,7 +10556,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -10589,7 +10589,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -10786,7 +10786,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; 
%atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -10987,7 +10987,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -11220,7 +11220,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -11453,7 +11453,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -11486,7 +11486,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -11519,7 +11519,7 @@ define i64 
@flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 9c49aade6099f..614500287339b 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -24,96 +24,82 @@ ; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) -; FIXME the offset here should not be folded: if %p points to the beginning of +; The offset here cannot be folded: if %p points to the beginning of ; scratch or LDS and %i is -1, a folded offset crashes the program. define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { -; GFX90A-SDAG-LABEL: flat_offset_maybe_oob: -; GFX90A-SDAG: ; %bb.0: -; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: flat_offset_maybe_oob: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-SDAG-LABEL: flat_offset_maybe_oob: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: flat_offset_maybe_oob: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-SDAG-LABEL: flat_offset_maybe_oob: ; GFX942-SDAG: ; %bb.0: ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12 +; GFX942-SDAG-NEXT: flat_load_dword v0, v[0:1] ; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: flat_offset_maybe_oob: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-SDAG-LABEL: flat_offset_maybe_oob: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 -; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-SDAG-NEXT: flat_load_b32 v0, v[0:1] offset:12 -; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX90A-GISEL-LABEL: flat_offset_maybe_oob: -; GFX90A-GISEL: ; %bb.0: -; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: flat_load_dword v0, v[0:1] -; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: flat_offset_maybe_oob: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: flat_offset_maybe_oob: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX10-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-LABEL: flat_offset_maybe_oob: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX942-GISEL-LABEL: 
flat_offset_maybe_oob: ; GFX942-GISEL: ; %bb.0: @@ -126,44 +112,6 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX942-GISEL-NEXT: flat_load_dword v0, v[0:1] ; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: flat_offset_maybe_oob: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-GISEL-LABEL: flat_offset_maybe_oob: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 -; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 -; GFX12-GISEL-NEXT: s_wait_alu 0xfffd -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; 
GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %idx = add nsw i32 %i, 3 %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx %l = load i32, ptr %arrayidx @@ -273,13 +221,742 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { %l = load i32, ptr addrspace(5) %arrayidx ret i32 %l } + +; If the GEP that adds the offset is inbounds, folding the offset is legal. +define i32 @flat_offset_inbounds(ptr %p, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load i32, ptr %arrayidx + ret i32 %l +} + +define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX90A-SDAG: ; %bb.0: +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-NEXT: v_add_co_u32_e32 v8, vcc, 28, v0 +; GFX90A-SDAG-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-SDAG-NEXT: flat_load_dword v10, v[8:9] +; GFX90A-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX90A-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 +; GFX90A-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: s_clause 0x1 +; GFX10-SDAG-NEXT: flat_load_dword v8, v[4:5] +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-SDAG-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[8:9], v[0:1], 0, 28 +; GFX942-SDAG-NEXT: flat_load_dword v10, v[8:9] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: flat_store_dword v[2:3], v10 offset:16 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: flat_load_b32 v8, v[4:5] +; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_inbounds_wide: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 28 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: flat_load_b32 v8, v[4:5] +; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; 
GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-SDAG-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-GISEL-NEXT: flat_load_dword v0, v[0:1] offset:28 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-GISEL-NEXT: flat_store_dword v[2:3], v0 offset:16 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-GISEL-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_inbounds_wide: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-GISEL-NEXT: flat_load_b32 v0, v[0:1] offset:28 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-GISEL-NEXT: flat_store_b32 v[2:3], v0 offset:16 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <5 x i32>, ptr %arrayidx + store <5 x i32> %l, ptr %pout + ret void +} + +define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-SDAG-MUBUF-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-SDAG-MUBUF: ; %bb.0: +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-MUBUF-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v24, s[4:5], 28, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v25, s[4:5], 0, v1, s[4:5] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e64 v28, s[4:5], 44, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v36, vcc, 0x8c, v0 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e64 v29, s[4:5], 0, v1, s[4:5] +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[4:7], v[28:29] offset:16 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[8:11], v[28:29] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[12:15], v[28:29] offset:48 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[16:19], v[28:29] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[20:23], v[28:29] offset:80 +; GFX90A-SDAG-MUBUF-NEXT: s_nop 0 +; 
GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX90A-SDAG-MUBUF-NEXT: s_nop 0 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v1, vcc +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:12 +; GFX90A-SDAG-MUBUF-NEXT: flat_load_dwordx4 v[48:51], v[36:37] +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX90A-SDAG-MUBUF-NEXT: v_add_co_u32_e32 v36, vcc, 0x88, v2 +; GFX90A-SDAG-MUBUF-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v3, vcc +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:48 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:64 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:32 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:16 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dword v[36:37], v50 +; GFX90A-SDAG-MUBUF-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 +; GFX90A-SDAG-MUBUF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-SDAG-FLATSCR-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-SDAG-FLATSCR: ; %bb.0: +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-SDAG-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v24, s[0:1], 28, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 
v25, s[0:1], 0, v1, s[0:1] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e64 v28, s[0:1], 44, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v36, vcc, 0x8c, v0 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e64 v29, s[0:1], 0, v1, s[0:1] +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[4:7], v[28:29] offset:16 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[8:11], v[28:29] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[12:15], v[28:29] offset:48 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[16:19], v[28:29] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[20:23], v[28:29] offset:80 +; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX90A-SDAG-FLATSCR-NEXT: s_nop 0 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v1, vcc +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:12 +; GFX90A-SDAG-FLATSCR-NEXT: flat_load_dwordx4 v[48:51], v[36:37] +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX90A-SDAG-FLATSCR-NEXT: v_add_co_u32_e32 v36, vcc, 0x88, v2 +; GFX90A-SDAG-FLATSCR-NEXT: v_addc_co_u32_e32 v37, vcc, 0, v3, vcc +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:48 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[20:23] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:64 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[12:15] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:32 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:16 +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX90A-SDAG-FLATSCR-NEXT: flat_store_dword v[36:37], v50 +; 
GFX90A-SDAG-FLATSCR-NEXT: flat_store_dwordx2 v[2:3], v[48:49] offset:128 +; GFX90A-SDAG-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-SDAG-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX10-SDAG-NEXT: s_clause 0x8 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[36:37] offset:80 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[36:37] offset:96 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[36:37] offset:48 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[36:37] offset:64 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[20:23], v[36:37] offset:16 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[36:37] offset:32 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[36:37] +; GFX10-SDAG-NEXT: flat_load_dwordx4 v[36:39], v[36:37] offset:112 +; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:48 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:64 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[16:19] offset:32 +; GFX10-SDAG-NEXT: 
s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[24:27] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-SDAG-NEXT: flat_store_dword v[48:49], v38 +; GFX10-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[36:37] offset:128 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX942-SDAG: ; %bb.0: +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x5c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[30:31], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x4c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[16:17], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x7c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[18:19], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x6c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[24:25], v[0:1], 0, 28 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[28:29], v[0:1], 0, 60 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[26:27], v[0:1], 0, 44 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[20:21], v[0:1], 0, s[2:3] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[4:7], v[20:21] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[8:11], v[16:17] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[12:15], v[18:19] +; GFX942-SDAG-NEXT: ; kill: killed $vgpr16_vgpr17 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr18_vgpr19 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr20_vgpr21 +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[16:19], v[26:27] +; GFX942-SDAG-NEXT: flat_load_dwordx4 
v[20:23], v[30:31] +; GFX942-SDAG-NEXT: ; kill: killed $vgpr30_vgpr31 +; GFX942-SDAG-NEXT: ; kill: killed $vgpr26_vgpr27 +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[28:31], v[28:29] +; GFX942-SDAG-NEXT: s_nop 0 +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:12 +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x8c +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX942-SDAG-NEXT: flat_load_dwordx4 v[52:55], v[0:1] +; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0x60 +; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70 +; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x50 +; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x88 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, s[0:1] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[2:3], 0, s[2:3] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[2:3], 0, s[4:5] +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[48:49], v[2:3], 0, 48 +; GFX942-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[2:3], 0, s[6:7] +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[12:15] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:64 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[20:23] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:32 +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[48:49], v[28:31] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[32:35] +; GFX942-SDAG-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:16 +; GFX942-SDAG-NEXT: flat_store_dword v[50:51], v54 +; GFX942-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[52:53] offset:128 +; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; 
GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x7 +; GFX11-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 +; GFX11-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 +; GFX11-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 +; GFX11-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 +; GFX11-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 +; GFX11-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] +; GFX11-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX11-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 +; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX11-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX11-SDAG-NEXT: s_clause 0x7 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 +; GFX11-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX11-SDAG-NEXT: flat_store_b32 v[48:49], v34 +; 
GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 +; GFX11-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-SDAG-LABEL: flat_offset_inbounds_very_wide: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v36, vcc_lo, v0, 28 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v1, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x7 +; GFX12-SDAG-NEXT: flat_load_b128 v[4:7], v[36:37] offset:80 +; GFX12-SDAG-NEXT: flat_load_b128 v[8:11], v[36:37] offset:96 +; GFX12-SDAG-NEXT: flat_load_b128 v[12:15], v[36:37] offset:64 +; GFX12-SDAG-NEXT: flat_load_b128 v[16:19], v[36:37] offset:32 +; GFX12-SDAG-NEXT: flat_load_b128 v[20:23], v[36:37] offset:16 +; GFX12-SDAG-NEXT: flat_load_b128 v[24:27], v[36:37] +; GFX12-SDAG-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX12-SDAG-NEXT: flat_load_b128 v[32:35], v[36:37] offset:112 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: flat_load_b128 v[35:38], v[36:37] offset:48 +; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v2, 48 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 
v3, vcc_lo +; GFX12-SDAG-NEXT: v_add_co_u32 v48, vcc_lo, 0x88, v2 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v49, null, 0, v3, vcc_lo +; GFX12-SDAG-NEXT: s_clause 0x7 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[4:7] offset:48 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[8:11] offset:64 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[12:15] offset:32 +; GFX12-SDAG-NEXT: flat_store_b128 v[0:1], v[16:19] +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[24:27] offset:16 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX12-SDAG-NEXT: flat_store_b32 v[48:49], v34 +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: flat_store_b128 v[2:3], v[35:38] offset:64 +; GFX12-SDAG-NEXT: flat_store_b64 v[2:3], v[32:33] offset:128 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX90A-GISEL: ; %bb.0: +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX90A-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX90A-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: 
flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX90A-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX90A-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX10-GISEL-NEXT: s_clause 0x8 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX10-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX10-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] 
offset:32 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX942-GISEL: ; %bb.0: +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:28 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:44 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:60 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:76 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:92 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:108 +; GFX942-GISEL-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:124 +; GFX942-GISEL-NEXT: flat_load_dwordx3 v[36:38], v[0:1] offset:140 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:48 +; GFX942-GISEL-NEXT: 
flat_store_dwordx4 v[2:3], v[20:23] offset:64 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:80 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[28:31] offset:96 +; GFX942-GISEL-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:112 +; GFX942-GISEL-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX11-GISEL-NEXT: s_clause 0x8 +; GFX11-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 +; GFX11-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 +; GFX11-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 +; GFX11-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 +; GFX11-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 +; GFX11-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 +; GFX11-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 +; GFX11-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 +; GFX11-GISEL-NEXT: 
s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-GISEL-LABEL: flat_offset_inbounds_very_wide: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffd +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-GISEL-NEXT: s_clause 0x8 +; GFX12-GISEL-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-GISEL-NEXT: flat_load_b128 v[8:11], v[0:1] offset:28 +; GFX12-GISEL-NEXT: flat_load_b128 v[12:15], v[0:1] offset:44 +; GFX12-GISEL-NEXT: flat_load_b128 v[16:19], v[0:1] offset:60 +; GFX12-GISEL-NEXT: flat_load_b128 v[20:23], v[0:1] offset:76 +; GFX12-GISEL-NEXT: flat_load_b128 v[24:27], v[0:1] offset:92 +; GFX12-GISEL-NEXT: flat_load_b128 v[28:31], v[0:1] offset:108 +; GFX12-GISEL-NEXT: flat_load_b128 v[32:35], v[0:1] offset:124 +; GFX12-GISEL-NEXT: flat_load_b96 v[36:38], v[0:1] offset:140 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x808 +; 
GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x708 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[8:11] offset:16 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x608 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[12:15] offset:32 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x508 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[16:19] offset:48 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x408 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[20:23] offset:64 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x308 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[24:27] offset:80 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x208 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[28:31] offset:96 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x108 +; GFX12-GISEL-NEXT: flat_store_b128 v[2:3], v[32:35] offset:112 +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-GISEL-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <35 x i32>, ptr %arrayidx + store <35 x i32> %l, ptr %pout + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10: {{.*}} ; GFX10-GISEL-FLATSCR: {{.*}} ; GFX10-MUBUF: {{.*}} ; GFX10-SDAG-FLATSCR: {{.*}} -; GFX12: {{.*}} -; GFX90A: {{.*}} ; GFX90A-GISEL-FLATSCR: {{.*}} ; GFX90A-MUBUF: {{.*}} -; GFX90A-SDAG-FLATSCR: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index bd11b0710fadd..36df710529599 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -13,9 +13,9 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -40,9 +40,9 @@ define protected amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, doub ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol @@ -71,11 +71,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: s_add_u32 s0, s0, -8 ; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: 
s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 37c57ef57570e..cfa03402ef048 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -579,7 +579,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void @@ -665,7 +665,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -729,9 +729,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %out.gep = getelementptr i32, ptr %out, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds 
i32, ptr %out, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out.gep ret void @@ -784,8 +784,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -895,7 +895,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void @@ -987,7 +987,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -1054,9 +1054,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %out.gep = getelementptr i64, ptr %out, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %out.gep = 
getelementptr inbounds i64, ptr %out, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out.gep ret void @@ -1112,8 +1112,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 2f4ecb8b0de92..cccd2449c3f01 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -1145,7 +1145,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void @@ -1231,7 +1231,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -1295,9 +1295,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: flat_store_dword v[0:1], v3 ; 
GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %out.gep = getelementptr i32, ptr %out, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds i32, ptr %out, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out.gep ret void @@ -1350,8 +1350,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void @@ -1625,7 +1625,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -1692,9 +1692,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %out.gep = getelementptr i64, ptr %out, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds i64, ptr %out, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out.gep ret void @@ -1750,8 +1750,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 4ad161c03f5b7..2ff69d234455f 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -12,18 +12,22 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-NEXT: s_mov_b32 s5, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-NEXT: .LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; 
GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 @@ -38,17 +42,20 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -410,10 +417,14 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-NEXT: .LBB4_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX12-NEXT: s_add_co_i32 s0, s0, -1 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_cmp_lg_u32 s0, 0 @@ -448,10 +459,14 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-SPREFETCH-NEXT: .LBB4_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SPREFETCH-NEXT: v_add_co_u32 v4, vcc_lo, 0xffffff50, v2 +; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd +; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo ; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, v2, 16 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffd ; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s0, s0, -1 ; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s0, 0 @@ -466,15 +481,17 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; ; GFX1250-LABEL: copy_flat_divergent: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: 
s_cmp_eq_u32 s0, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s2, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader ; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff50 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] ; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] @@ -482,13 +499,13 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB4_2: ; %for.body ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-NEXT: flat_load_b128 v[4:7], v[2:3] offset:-176 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[2:3] ; GFX1250-NEXT: flat_prefetch_b8 v[2:3] scope:SCOPE_SE -; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 16, v[2:3] -; GFX1250-NEXT: s_add_co_i32 s0, s0, -1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-NEXT: s_add_co_i32 s2, s2, -1 +; GFX1250-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-NEXT: flat_load_b128 v[4:7], v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: flat_store_b128 v[0:1], v[4:7] ; GFX1250-NEXT: s_wait_xcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 06213ef3e06ea..d8b9f93ca8537 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ 
-16,62 +16,71 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB0_1: ; %load-store-loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x4 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128 +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo +; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 
v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 
v[100:101], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; 
CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -81,621 +90,656 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-LABEL: memcpy_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB0_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; 
ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: 
v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67] +; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69] +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], 
v63 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 +; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: 
buffer_load_dword v99, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v52 
offset:16 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v82 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 +; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v80 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:124 +; 
ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 
offset:220 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:193 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:188 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v35 offset:189 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:176 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v117, off, 
s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 -; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:148 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v31 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 
-; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: 
buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 
offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 
+; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 
+; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -704,25 +748,34 @@ define void @memcpy_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 
offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB0_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x10 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 +; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -1549,39 +1602,41 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: 
s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] -; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; CHECK-NEXT: s_cbranch_vccnz .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %memcpy-split @@ -1594,27 +1649,31 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB2_1: ; %load-store-loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; 
ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 -; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 -; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 -; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 -; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[4:5], off offset:160 -; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[4:5], off offset:144 -; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[4:5], off offset:128 -; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[4:5], off offset:112 -; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[4:5], off offset:96 -; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[4:5], off offset:80 -; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[4:5], off offset:64 -; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[4:5], off offset:48 -; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 -; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 -; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[8:9], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[8:9], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[8:9], off offset:208 +; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[8:9], off offset:192 +; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[8:9], off offset:176 +; ALIGNED-NEXT: global_load_dwordx4 v[52:55], v[8:9], off offset:160 +; ALIGNED-NEXT: global_load_dwordx4 v[48:51], v[8:9], off offset:144 +; ALIGNED-NEXT: global_load_dwordx4 v[36:39], v[8:9], off offset:128 +; ALIGNED-NEXT: global_load_dwordx4 v[32:35], v[8:9], off offset:112 +; ALIGNED-NEXT: global_load_dwordx4 v[28:31], v[8:9], off offset:96 +; ALIGNED-NEXT: global_load_dwordx4 v[24:27], v[8:9], off offset:80 +; ALIGNED-NEXT: global_load_dwordx4 v[20:23], v[8:9], off offset:64 +; ALIGNED-NEXT: global_load_dwordx4 v[16:19], v[8:9], off 
offset:48 +; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[8:9], off offset:32 +; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[8:9], off +; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[8:9], off offset:16 +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) @@ -1622,466 +1681,465 @@ define void @memcpy_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4) ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 ; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[86:87], v116 
offset:244 +; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 +; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 ; ALIGNED-NEXT: buffer_store_dword 
v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], 
s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 
offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 
24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], 
v37 offset:78 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte 
v[86:87], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_vccnz .LBB2_1 ; ALIGNED-NEXT: ; %bb.2: ; %memcpy-split ; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) @@ -3599,38 +3657,38 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: buffer_load_dword v13, v2, s[0:3], 0 offen offset:72 ; CHECK-NEXT: buffer_load_dword v12, v2, s[0:3], 0 offen offset:68 ; CHECK-NEXT: buffer_load_dword v11, v2, s[0:3], 0 offen offset:64 -; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:108 -; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:124 -; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:120 -; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:116 -; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:112 -; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:104 -; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:100 -; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:96 -; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:236 -; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:252 -; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:248 -; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:244 -; CHECK-NEXT: buffer_load_dword 
v31, v2, s[0:3], 0 offen offset:240 -; CHECK-NEXT: buffer_load_dword v29, v2, s[0:3], 0 offen offset:232 -; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:228 -; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:224 -; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:220 -; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:216 -; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:212 -; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:208 -; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:204 -; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:200 -; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:196 -; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:192 -; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:172 -; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:188 -; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:184 -; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:180 -; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:176 -; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:168 -; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:164 -; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:160 +; CHECK-NEXT: buffer_load_dword v22, v2, s[0:3], 0 offen offset:236 +; CHECK-NEXT: buffer_load_dword v26, v2, s[0:3], 0 offen offset:252 +; CHECK-NEXT: buffer_load_dword v25, v2, s[0:3], 0 offen offset:248 +; CHECK-NEXT: buffer_load_dword v24, v2, s[0:3], 0 offen offset:244 +; CHECK-NEXT: buffer_load_dword v23, v2, s[0:3], 0 offen offset:240 +; CHECK-NEXT: buffer_load_dword v21, v2, s[0:3], 0 offen offset:232 +; CHECK-NEXT: buffer_load_dword v20, v2, s[0:3], 0 offen offset:228 +; CHECK-NEXT: buffer_load_dword v19, v2, s[0:3], 0 offen offset:224 +; CHECK-NEXT: buffer_load_dword v30, v2, s[0:3], 0 offen offset:220 +; CHECK-NEXT: 
buffer_load_dword v29, v2, s[0:3], 0 offen offset:216 +; CHECK-NEXT: buffer_load_dword v28, v2, s[0:3], 0 offen offset:212 +; CHECK-NEXT: buffer_load_dword v27, v2, s[0:3], 0 offen offset:208 +; CHECK-NEXT: buffer_load_dword v34, v2, s[0:3], 0 offen offset:204 +; CHECK-NEXT: buffer_load_dword v33, v2, s[0:3], 0 offen offset:200 +; CHECK-NEXT: buffer_load_dword v32, v2, s[0:3], 0 offen offset:196 +; CHECK-NEXT: buffer_load_dword v31, v2, s[0:3], 0 offen offset:192 +; CHECK-NEXT: buffer_load_dword v38, v2, s[0:3], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v51, v2, s[0:3], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v50, v2, s[0:3], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v49, v2, s[0:3], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v48, v2, s[0:3], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v37, v2, s[0:3], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v36, v2, s[0:3], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v35, v2, s[0:3], 0 offen offset:96 +; CHECK-NEXT: buffer_load_dword v55, v2, s[0:3], 0 offen offset:188 +; CHECK-NEXT: buffer_load_dword v54, v2, s[0:3], 0 offen offset:184 +; CHECK-NEXT: buffer_load_dword v53, v2, s[0:3], 0 offen offset:180 +; CHECK-NEXT: buffer_load_dword v52, v2, s[0:3], 0 offen offset:176 +; CHECK-NEXT: buffer_load_dword v67, v2, s[0:3], 0 offen offset:172 +; CHECK-NEXT: buffer_load_dword v66, v2, s[0:3], 0 offen offset:168 +; CHECK-NEXT: buffer_load_dword v65, v2, s[0:3], 0 offen offset:164 +; CHECK-NEXT: buffer_load_dword v64, v2, s[0:3], 0 offen offset:160 ; CHECK-NEXT: buffer_load_dword v71, v2, s[0:3], 0 offen offset:156 ; CHECK-NEXT: buffer_load_dword v70, v2, s[0:3], 0 offen offset:152 ; CHECK-NEXT: buffer_load_dword v69, v2, s[0:3], 0 offen offset:148 @@ -3651,29 +3709,31 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: 
s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo +; CHECK-NEXT: s_waitcnt vmcnt(43) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:192 +; CHECK-NEXT: s_waitcnt vmcnt(40) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:176 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:208 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:192 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:144 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] 
offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -3756,21 +3816,21 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:31 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:32 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:34 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:35 ; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:36 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:39 ; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:42 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:43 ; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: 
buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:49 ; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 @@ -3779,12 +3839,12 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:53 ; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:60 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:65 @@ -3835,20 +3895,20 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 
-; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19 ; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 ; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 +; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17 ; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22 ; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27 ; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v6, 16, v5 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v8, 16, v7 @@ -3861,20 +3921,20 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v38 +; 
ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 @@ -3898,7 +3958,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:86 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 16, v3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) @@ -3909,22 +3969,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill @@ -3933,11 +3993,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; 
ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill @@ -4205,7 +4265,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -4236,17 +4296,17 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_clause 
0x1 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 @@ -4261,302 +4321,309 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v120, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte 
v108, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:164 ; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v104, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v105 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte 
v76, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 
offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:191 ; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 
+; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: s_clause 0x3c +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:193 ; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:196 ; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte 
v98, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:205 ; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:207 ; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, 
s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:217 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:220 ; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 
offen offset:229 ; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 ; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 ; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 ; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:244 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:246 +; 
ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:252 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: s_waitcnt vmcnt(59) +; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v100, 8, v101 +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v97 +; ALIGNED-NEXT: s_waitcnt vmcnt(49) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v107, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v79, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v84 +; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v82 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v1, 16, v0 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v14, 8, v18 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v10, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v72, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v70 ; 
ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:4 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 -; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v65, 8, v71 -; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v53 ; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 -; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 -; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 -; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v114, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v38, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v98, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v81, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v29 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, 
v20 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 -; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v22 +; ALIGNED-NEXT: v_lshl_or_b32 v48, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v16, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v6, 16, v5 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v7, 8, v8 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshl_or_b32 v89, v9, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v89, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v90, v90, 8, v121 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v109, off, 
s[0:3], s32 offset:1108 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v111, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v121 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v109, 8, v106 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v122, 8, v111 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:12 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:15 ; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 -; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshl_or_b32 v89, v127, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v90, v106, 8, v90 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, v89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:9 ; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v111, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v121, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v90, 16, 
v89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v89, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4565,84 +4632,86 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v0, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:224 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword 
v115, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v3, 3 +; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v4, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:247 +; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:246 +; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:252 +; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:250 +; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:251 +; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:249 +; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:245 +; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:239 +; ALIGNED-NEXT: flat_store_byte v[5:6], v16 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:238 +; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:244 +; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:241 +; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:237 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:231 +; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:232 +; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:236 +; ALIGNED-NEXT: flat_store_byte v[5:6], v27 offset:234 +; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:235 +; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:233 +; ALIGNED-NEXT: flat_store_byte v[5:6], v24 offset:229 +; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:223 +; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:222 +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v30 offset:228 +; ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:221 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:205 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:210 +; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:206 +; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:207 +; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:211 +; ALIGNED-NEXT: flat_store_byte v[5:6], v80 offset:209 +; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[5:6], v52 offset:216 +; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:214 +; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:220 +; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:218 +; ALIGNED-NEXT: flat_store_byte v[5:6], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:217 +; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:205 +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:199 +; ALIGNED-NEXT: flat_store_byte v[5:6], v85 offset:200 +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v83 offset:198 +; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:204 +; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:203 +; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:201 +; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:197 +; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:192 +; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:190 +; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:194 +; ALIGNED-NEXT: flat_store_byte v[5:6], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:189 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -4656,22 +4725,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:183 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:183 +; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:184 +; ALIGNED-NEXT: flat_store_byte v[5:6], v118 offset:182 +; ALIGNED-NEXT: flat_store_byte v[5:6], v41 offset:188 +; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:186 +; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:187 +; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:185 +; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:181 +; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:175 +; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:174 +; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[5:6], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:179 +; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[5:6], v61 offset:173 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 @@ -4684,22 +4753,22 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 
offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:167 +; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:166 +; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:172 +; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:170 +; ALIGNED-NEXT: flat_store_byte v[5:6], v76 offset:171 +; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:169 +; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:165 +; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v92 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v91 offset:164 +; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:163 +; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[5:6], v108 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 @@ -4712,44 +4781,44 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v125 offset:156 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 @@ -4764,49 +4833,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:129 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 @@ -4824,52 +4893,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 @@ -4884,52 +4953,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 @@ -4944,52 +5013,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) 
align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:344 @@ -5004,49 +5073,49 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 @@ -5064,52 +5133,52 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:416 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 +; ALIGNED-NEXT: flat_store_byte v[5:6], 
v0 offset:53 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 @@ -5124,56 +5193,56 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload @@ -5182,46 +5251,46 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 
offset:31 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:17 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 @@ -5231,42 +5300,42 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 +; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:7 +; ALIGNED-NEXT: flat_store_byte v[5:6], v121 offset:8 +; ALIGNED-NEXT: flat_store_byte v[5:6], v127 offset:10 +; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:15 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 
; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9 ; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload @@ -5399,62 +5468,68 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: .LBB5_2: ; %memmove_fwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v52, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v53, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[52:53] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[52:53] +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v52, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v53, vcc_lo +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v52, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v53, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v52 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v53, vcc_lo +; CHECK-NEXT: s_clause 0xd +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[52:53] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[52:53] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[80:81] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 
v[36:39], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[52:53] offset:128 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:224 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 -; CHECK-NEXT: s_waitcnt 
vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo +; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_lg_u64 s[4:5], 0x800 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_cbranch_scc1 .LBB5_2 ; CHECK-NEXT: .LBB5_3: ; %Flow5 ; CHECK-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -5465,62 +5540,71 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; CHECK-NEXT: s_mov_b32 s7, -1 ; CHECK-NEXT: .LBB5_5: ; %memmove_bwd_loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v2, s4 -; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v3, vcc_lo +; CHECK-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 +; CHECK-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo +; CHECK-NEXT: s_clause 0x4 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[24:25] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[24:25] +; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:128 +; CHECK-NEXT: v_add_co_u32 v80, vcc_lo, v24, 0x60 +; CHECK-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v96, vcc_lo, v24, 48 +; CHECK-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v25, vcc_lo +; CHECK-NEXT: v_add_co_u32 v84, vcc_lo, 0x50, v24 +; CHECK-NEXT: v_add_co_ci_u32_e64 v85, null, 0, v25, vcc_lo +; CHECK-NEXT: s_clause 0xa +; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[80:81] offset:128 +; CHECK-NEXT: 
flat_load_dwordx4 v[28:31], v[80:81] offset:144 +; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[80:81] offset:96 +; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[80:81] offset:112 +; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[80:81] offset:64 +; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[80:81] offset:80 +; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[80:81] offset:48 +; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[80:81] +; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[80:81] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[84:85] +; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[96:97] offset:224 -; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[96:97] offset:240 -; CHECK-NEXT: flat_load_dwordx4 v[12:15], v[96:97] offset:192 -; CHECK-NEXT: flat_load_dwordx4 v[16:19], v[96:97] offset:208 -; CHECK-NEXT: flat_load_dwordx4 v[20:23], v[96:97] offset:160 -; CHECK-NEXT: flat_load_dwordx4 v[24:27], v[96:97] offset:176 -; CHECK-NEXT: flat_load_dwordx4 v[28:31], v[96:97] offset:128 -; CHECK-NEXT: flat_load_dwordx4 v[32:35], v[96:97] offset:144 -; CHECK-NEXT: flat_load_dwordx4 v[36:39], v[96:97] offset:96 -; CHECK-NEXT: flat_load_dwordx4 v[48:51], v[96:97] offset:112 -; CHECK-NEXT: flat_load_dwordx4 v[52:55], v[96:97] offset:64 -; CHECK-NEXT: flat_load_dwordx4 v[64:67], v[96:97] offset:80 -; CHECK-NEXT: flat_load_dwordx4 v[68:71], v[96:97] offset:32 -; CHECK-NEXT: flat_load_dwordx4 v[80:83], v[96:97] offset:48 -; CHECK-NEXT: flat_load_dwordx4 v[84:87], v[96:97] -; CHECK-NEXT: flat_load_dwordx4 v[96:99], v[96:97] offset:16 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 ; CHECK-NEXT: s_addc_u32 s5, s5, -1 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_dwordx4 v[100:101], v[4:7] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] ; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[32:35] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:80 +; CHECK-NEXT: 
flat_store_dwordx4 v[102:103], v[64:67] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[84:87] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[84:87] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 -; CHECK-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[96:99] ; CHECK-NEXT: s_cbranch_scc0 .LBB5_5 ; CHECK-NEXT: .LBB5_6: ; %Flow6 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -5530,14 +5614,23 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-LABEL: memmove_p0_p0_sz2048: ; ALIGNED: ; %bb.0: ; %entry ; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded 
Spill +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_mov_b32 s4, exec_lo ; ALIGNED-NEXT: v_cmpx_ge_u64_e64 v[2:3], v[0:1] ; ALIGNED-NEXT: s_xor_b32 s6, exec_lo, s4 @@ -5546,609 +5639,635 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0 ; ALIGNED-NEXT: .LBB5_2: ; %memmove_fwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[20:21] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[22:25], v[20:21] offset:224 -; 
ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[20:21] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[20:21] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[20:21] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[98:101], v[20:21] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[20:21] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[20:21] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[20:21] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[20:21] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[20:21] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[50:53], v[20:21] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[20:21] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[20:21] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[20:21] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[20:21] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:76 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:72 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, 
null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v38, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v39, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v80, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v81, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; ALIGNED-NEXT: flat_load_dwordx4 v[34:37], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[82:85], v[38:39] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[70:71] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[80:81] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, 
null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:172 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:168 +; 
ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v56 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v56 offset:129 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v57 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v57 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v49, off, 
s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:44 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:40 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:210 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:208 -; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:204 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:192 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:328 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:332 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 +; ALIGNED-NEXT: buffer_load_dword 
v26, off, s[0:3], s32 offset:324 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v27 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:33 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:176 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], 
s32 offset:148 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:272 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:12 +; ALIGNED-NEXT: flat_store_byte v[24:25], v30 offset:16 +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:156 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:152 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:288 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 
offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:9 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:168 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v36 offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:164 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 +; ALIGNED-NEXT: flat_store_byte v[24:25], v35 offset:4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:160 -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v34 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v34 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:108 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:158 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:144 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:124 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:112 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:237 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:140 ; 
ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:124 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:120 -; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:116 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:130 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:128 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:140 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:136 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:132 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:221 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:204 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:112 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:92 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:88 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:216 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v65 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:205 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:96 +; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:216 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:108 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:104 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:100 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:108 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:201 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:104 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:197 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v71 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:193 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:96 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:189 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:188 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:160 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:80 -; 
ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:185 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v80 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:181 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:173 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, 
v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:64 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v82 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:157 +; ALIGNED-NEXT: buffer_store_dword 
v112, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:156 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:152 +; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:148 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v57 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v57 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 
offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v34 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:60 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:56 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v29 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:245 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v19, 24, v26 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v33 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v32 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v31 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v30 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:233 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v35 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v21 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v49 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v39 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v38 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v53 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v52 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v65 ; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v64 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v54 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v66 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v66 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v80 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v71 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v70 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v85 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:185 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v84 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 
24, v82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v82 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 24, v97 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v96 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 24, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v86 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:244 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v29, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:89 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:19 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:92 +; ALIGNED-NEXT: flat_store_byte 
v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6157,14 +6276,14 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB5_2 ; ALIGNED-NEXT: .LBB5_3: ; %Flow5 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -6175,609 +6294,635 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB5_5: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v2, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v3, vcc_lo -; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[24:25] offset:240 -; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[24:25] offset:224 -; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[24:25] -; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[24:25] offset:16 -; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[24:25] offset:32 -; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] offset:48 -; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[24:25] offset:64 -; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[24:25] offset:80 -; ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[24:25] offset:96 -; ALIGNED-NEXT: flat_load_dwordx4 v[32:35], v[24:25] offset:112 -; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[24:25] offset:128 -; ALIGNED-NEXT: flat_load_dwordx4 v[52:55], v[24:25] offset:144 -; ALIGNED-NEXT: flat_load_dwordx4 v[66:69], v[24:25] offset:160 -; ALIGNED-NEXT: flat_load_dwordx4 v[81:84], v[24:25] offset:176 -; ALIGNED-NEXT: flat_load_dwordx4 v[96:99], v[24:25] offset:192 -; ALIGNED-NEXT: flat_load_dwordx4 v[100:103], v[24:25] offset:208 -; ALIGNED-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: 
buffer_store_dword v18, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:324 -; ALIGNED-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v6, vcc_lo, v4, 48 +; ALIGNED-NEXT: flat_load_dwordx4 v[20:23], v[4:5] offset:128 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v8, vcc_lo, 0x50, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v12, vcc_lo, 0x60, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v16, vcc_lo, 0x70, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, 0x90, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v52, vcc_lo, 0xa0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v53, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v54, vcc_lo, 0xb0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v55, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v64, vcc_lo, 0xc0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v65, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v66, vcc_lo, 0xd0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v67, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v68, vcc_lo, 0xe0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v69, null, 0, v5, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v70, vcc_lo, 0xf0, v4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v71, null, 0, v5, vcc_lo +; ALIGNED-NEXT: s_clause 0xe +; 
ALIGNED-NEXT: flat_load_dwordx4 v[26:29], v[4:5] +; ALIGNED-NEXT: flat_load_dwordx4 v[35:38], v[4:5] offset:16 +; ALIGNED-NEXT: flat_load_dwordx4 v[30:33], v[4:5] offset:32 +; ALIGNED-NEXT: flat_load_dwordx4 v[48:51], v[4:5] offset:64 +; ALIGNED-NEXT: flat_load_dwordx4 v[4:7], v[6:7] +; ALIGNED-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; ALIGNED-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; ALIGNED-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; ALIGNED-NEXT: flat_load_dwordx4 v[112:115], v[24:25] +; ALIGNED-NEXT: flat_load_dwordx4 v[116:119], v[52:53] +; ALIGNED-NEXT: flat_load_dwordx4 v[40:43], v[54:55] +; ALIGNED-NEXT: flat_load_dwordx4 v[44:47], v[64:65] +; ALIGNED-NEXT: flat_load_dwordx4 v[55:58], v[66:67] +; ALIGNED-NEXT: flat_load_dwordx4 v[59:62], v[68:69] +; ALIGNED-NEXT: flat_load_dwordx4 v[68:71], v[70:71] +; ALIGNED-NEXT: v_add_co_u32 v24, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v25, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v31 offset:254 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:252 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v30 offset:250 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:248 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v25 offset:246 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:244 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v24 offset:242 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:240 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(22) -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(15) lgkmcnt(15) +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:348 -; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:344 -; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:340 -; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_load_dword v102, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v103, off, s[0:3], s32 offset:424 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:420 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 offset:416 +; ALIGNED-NEXT: v_add_co_u32 v22, vcc_lo, v24, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v25, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v20, vcc_lo, v24, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v25, vcc_lo ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v51 offset:238 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:236 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v102 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:137 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v50 offset:234 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:232 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v103 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:133 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v49 offset:230 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v63 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v63 offset:129 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v36 offset:226 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:224 -; ALIGNED-NEXT: s_waitcnt lgkmcnt(16) -; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:300 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v72 offset:124 +; ALIGNED-NEXT: flat_store_byte v[24:25], v72 offset:128 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(19) +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v71 offset:222 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:220 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v70 offset:218 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:216 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:212 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v64 offset:210 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:208 -; ALIGNED-NEXT: buffer_store_dword v96, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: buffer_store_dword v97, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:316 +; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v101 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:73 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v100 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:69 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v99 offset:64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:65 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v98 offset:60 +; ALIGNED-NEXT: flat_store_byte v[24:25], v98 offset:64 +; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:588 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v87 offset:206 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:204 -; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v86 offset:202 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:200 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v85 offset:198 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:196 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v80 offset:194 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:192 -; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 +; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:588 +; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:576 +; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:580 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v39 offset:36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:37 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v34 offset:40 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:41 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v33 offset:28 +; ALIGNED-NEXT: flat_store_byte v[24:25], v33 offset:32 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v32 offset:32 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:33 +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:540 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v101, off, s[0:3], s32 offset:396 
-; ALIGNED-NEXT: buffer_load_dword v99, off, s[0:3], s32 offset:392 -; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v101 offset:190 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:188 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v99 offset:186 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:184 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v96 offset:182 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:180 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v81 offset:178 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:176 -; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:536 +; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:528 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v67 offset:24 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:25 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v66 offset:20 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:21 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v53 offset:16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:17 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v52 offset:12 +; ALIGNED-NEXT: flat_store_byte 
v[24:25], v52 offset:16 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v100, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v100 offset:174 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:172 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v97 offset:170 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:168 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v82 offset:166 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:164 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v66 offset:162 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:160 -; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:556 +; ALIGNED-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v86 offset:8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[22:23], v82 offset:4 +; ALIGNED-NEXT: flat_store_byte v[24:25], v82 offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v81 +; ALIGNED-NEXT: flat_store_byte v[24:25], v81 offset:4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[24:25], v80 offset:2 +; ALIGNED-NEXT: flat_store_byte v[24:25], v80 +; ALIGNED-NEXT: s_waitcnt lgkmcnt(40) +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v98, off, s[0:3], s32 offset:364 -; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:360 -; ALIGNED-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:356 -; ALIGNED-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v98 offset:158 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:156 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v83 offset:154 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:152 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v67 offset:150 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:148 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v52 offset:146 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:144 -; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:380 +; ALIGNED-NEXT: buffer_load_dword v97, off, s[0:3], s32 offset:380 +; 
ALIGNED-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:376 +; ALIGNED-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:372 +; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:368 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v97 offset:248 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:249 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v87 offset:244 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:245 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v83 offset:240 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:241 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v68 offset:236 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:237 +; ALIGNED-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:380 -; ALIGNED-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:376 -; ALIGNED-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:372 -; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v84 offset:142 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:140 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v68 offset:138 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v53 offset:134 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:132 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v37 offset:130 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:128 -; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_load_dword v96, off, s[0:3], s32 offset:396 +; ALIGNED-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:392 +; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:388 +; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v96 offset:232 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:233 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v84 offset:228 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:229 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v69 offset:224 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:225 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v54 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:221 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:456 -; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v69 offset:126 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:124 -; ALIGNED-NEXT: 
s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v54 offset:122 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:120 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v38 offset:118 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:116 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:112 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:348 +; ALIGNED-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:344 +; ALIGNED-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:340 +; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v85 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:217 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v70 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:213 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v55 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:209 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v48 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:205 +; ALIGNED-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v55, off, 
s[0:3], s32 offset:476 -; ALIGNED-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v55 offset:110 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:108 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v39 offset:106 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:104 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v33 offset:102 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:100 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v26 offset:98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:96 -; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:428 +; ALIGNED-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:364 +; ALIGNED-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:360 +; ALIGNED-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:356 +; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:352 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:201 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v64 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:197 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v49 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:193 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v35 offset:188 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v35 offset:189 +; ALIGNED-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 -; ALIGNED-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:424 -; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 -; ALIGNED-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v48 offset:94 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:92 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v34 offset:90 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:88 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v27 offset:86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:84 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v21 offset:82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:80 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v65 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:185 +; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v50 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:181 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v36 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:177 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v29 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:173 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v35 offset:78 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:76 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v28 offset:74 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:72 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v22 offset:70 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:68 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v19 offset:66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:64 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v51, off, s[0:3], s32 
offset:460 +; ALIGNED-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v51 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:169 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v37 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:165 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v30 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v27 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:157 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v29 offset:62 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:60 -; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v23 offset:58 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:56 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v20 offset:54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:52 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v18 offset:50 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:48 -; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:532 -; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:540 +; ALIGNED-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:412 +; ALIGNED-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:408 +; ALIGNED-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:404 +; ALIGNED-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v102 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v103 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:140 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v63 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:138 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v72 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:136 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:134 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:132 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:130 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:128 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:126 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v38 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], 
v38 offset:153 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v31 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:149 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v28 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:145 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v26 offset:140 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:141 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:536 -; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 -; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:528 -; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:532 +; ALIGNED-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 8, v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi 
v[16:17], v15 offset:42 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v19 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:121 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v14 offset:46 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:44 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v18 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:117 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v13 offset:34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v17 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:113 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v12 offset:38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v12 offset:36 -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v16 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:109 +; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:524 ; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v11 offset:30 -; 
ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:28 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:24 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:20 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:16 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:520 +; ALIGNED-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v15 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:105 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v14 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:101 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v13 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:97 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v12 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:93 +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: s_clause 0x3 
-; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v49 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v36 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v64 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v67 +; ALIGNED-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:72 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:68 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:66 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:64 +; ALIGNED-NEXT: 
flat_store_byte v[20:21], v98 offset:62 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v11 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:89 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v10 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:85 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v9 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:81 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v8 offset:76 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:77 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:568 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v67 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:243 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v86 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v86, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:239 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v85 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v96 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v52 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[24:25], v114 offset:1 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v81 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v97 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v97 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 8, v97 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v103 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v82 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 8, v82 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:40 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v83 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:211 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v84 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:149 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v8 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:44 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v68 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v68 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:207 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v37 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v69 +; ALIGNED-NEXT: flat_store_byte v[20:21], 
v34 offset:42 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v96 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:32 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v69 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v54 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 24, v32 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v85 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v70 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 8, v8 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, 
v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v48 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v34 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v21, 8, v21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:179 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 8, v64 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v35 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v65 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v65 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 24, v50 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 8, v50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:10 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 
24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v51 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:4 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v37 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:2 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v27 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:252 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v31 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:248 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v28 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 8, v28 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v22, 8, v22 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v19 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:246 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 24, v19 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v19, 8, v19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v29 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v29 -; ALIGNED-NEXT: 
flat_store_byte v[16:17], v97 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v97, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v23, 8, v23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v20, 8, v20 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:242 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v18 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v18, 8, v18 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:240 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v17, 8, v17 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:238 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v15 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v96, 24, v14 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v14, 8, v14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:159 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v13 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v13 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:157 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v12 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v12 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v12, 8, v12 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:155 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v11 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 24, v11 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 8, v11 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:153 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v10 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v10, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:151 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 24, v9 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:224 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v9 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v65 offset:147 -; ALIGNED-NEXT: flat_store_byte v[16:17], v52 offset:145 -; ALIGNED-NEXT: flat_store_byte v[16:17], v25 offset:143 -; ALIGNED-NEXT: flat_store_byte v[16:17], v84 offset:141 -; ALIGNED-NEXT: flat_store_byte v[16:17], v64 offset:139 -; ALIGNED-NEXT: flat_store_byte v[16:17], v68 offset:137 -; ALIGNED-NEXT: flat_store_byte v[16:17], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[16:17], v53 offset:133 -; ALIGNED-NEXT: flat_store_byte v[16:17], v87 offset:131 -; ALIGNED-NEXT: flat_store_byte v[16:17], v37 offset:129 -; ALIGNED-NEXT: flat_store_byte v[16:17], v24 offset:127 -; ALIGNED-NEXT: flat_store_byte v[16:17], v69 offset:125 -; ALIGNED-NEXT: flat_store_byte v[16:17], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[16:17], v54 offset:121 -; ALIGNED-NEXT: flat_store_byte v[16:17], v114 offset:119 -; ALIGNED-NEXT: flat_store_byte v[16:17], v38 offset:117 -; ALIGNED-NEXT: flat_store_byte v[16:17], v85 offset:115 -; ALIGNED-NEXT: flat_store_byte v[16:17], v32 offset:113 -; ALIGNED-NEXT: flat_store_byte v[16:17], v51 offset:111 -; ALIGNED-NEXT: flat_store_byte v[16:17], v55 offset:109 -; ALIGNED-NEXT: flat_store_byte v[16:17], v80 offset:107 -; ALIGNED-NEXT: flat_store_byte v[16:17], v39 offset:105 -; ALIGNED-NEXT: flat_store_byte v[16:17], v115 
offset:103 -; ALIGNED-NEXT: flat_store_byte v[16:17], v33 offset:101 -; ALIGNED-NEXT: flat_store_byte v[16:17], v101 offset:99 -; ALIGNED-NEXT: flat_store_byte v[16:17], v26 offset:97 -; ALIGNED-NEXT: flat_store_byte v[16:17], v50 offset:95 -; ALIGNED-NEXT: flat_store_byte v[16:17], v48 offset:93 -; ALIGNED-NEXT: flat_store_byte v[16:17], v99 offset:91 -; ALIGNED-NEXT: flat_store_byte v[16:17], v34 offset:89 -; ALIGNED-NEXT: flat_store_byte v[16:17], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte v[16:17], v27 offset:85 -; ALIGNED-NEXT: flat_store_byte v[16:17], v96 offset:83 -; ALIGNED-NEXT: flat_store_byte v[16:17], v21 offset:81 -; ALIGNED-NEXT: flat_store_byte v[16:17], v49 offset:79 -; ALIGNED-NEXT: flat_store_byte v[16:17], v35 offset:77 -; ALIGNED-NEXT: flat_store_byte v[16:17], v81 offset:75 -; ALIGNED-NEXT: flat_store_byte v[16:17], v28 offset:73 -; ALIGNED-NEXT: flat_store_byte v[16:17], v31 offset:71 -; ALIGNED-NEXT: flat_store_byte v[16:17], v22 offset:69 -; ALIGNED-NEXT: flat_store_byte v[16:17], v100 offset:67 -; ALIGNED-NEXT: flat_store_byte v[16:17], v19 offset:65 -; ALIGNED-NEXT: flat_store_byte v[16:17], v36 offset:63 -; ALIGNED-NEXT: flat_store_byte v[16:17], v29 offset:61 -; ALIGNED-NEXT: flat_store_byte v[16:17], v97 offset:59 -; ALIGNED-NEXT: flat_store_byte v[16:17], v23 offset:57 -; ALIGNED-NEXT: flat_store_byte v[16:17], v103 offset:55 -; ALIGNED-NEXT: flat_store_byte v[16:17], v20 offset:53 -; ALIGNED-NEXT: flat_store_byte v[16:17], v82 offset:51 -; ALIGNED-NEXT: flat_store_byte v[16:17], v18 offset:49 -; ALIGNED-NEXT: flat_store_byte v[16:17], v71 offset:43 -; ALIGNED-NEXT: flat_store_byte v[16:17], v15 offset:41 -; ALIGNED-NEXT: flat_store_byte v[16:17], v66 offset:47 -; ALIGNED-NEXT: flat_store_byte v[16:17], v14 offset:45 -; ALIGNED-NEXT: flat_store_byte v[16:17], v30 offset:35 -; ALIGNED-NEXT: flat_store_byte v[16:17], v13 offset:33 -; ALIGNED-NEXT: flat_store_byte v[16:17], v98 offset:39 -; ALIGNED-NEXT: flat_store_byte v[16:17], 
v12 offset:37 -; ALIGNED-NEXT: flat_store_byte v[16:17], v70 offset:31 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:29 -; ALIGNED-NEXT: flat_store_byte v[16:17], v83 offset:27 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:25 -; ALIGNED-NEXT: flat_store_byte v[16:17], v112 offset:23 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:21 -; ALIGNED-NEXT: flat_store_byte v[16:17], v67 offset:19 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:17 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:12 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[16:17], v6 offset:8 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte_d16_hi v[16:17], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 +; ALIGNED-NEXT: flat_store_byte v[20:21], v32 offset:220 +; ALIGNED-NEXT: flat_store_byte v[20:21], v85 offset:218 +; ALIGNED-NEXT: flat_store_byte v[20:21], v102 offset:216 +; ALIGNED-NEXT: flat_store_byte v[20:21], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[20:21], v67 offset:212 +; ALIGNED-NEXT: flat_store_byte v[20:21], v55 offset:210 +; ALIGNED-NEXT: flat_store_byte v[20:21], v103 offset:208 +; ALIGNED-NEXT: flat_store_byte v[20:21], v48 offset:206 +; ALIGNED-NEXT: flat_store_byte v[20:21], v66 offset:204 +; ALIGNED-NEXT: flat_store_byte v[20:21], v71 offset:202 +; ALIGNED-NEXT: flat_store_byte v[20:21], v112 offset:200 +; ALIGNED-NEXT: flat_store_byte v[20:21], v64 offset:198 +; ALIGNED-NEXT: flat_store_byte v[20:21], v53 offset:196 +; ALIGNED-NEXT: flat_store_byte v[20:21], v49 offset:194 +; ALIGNED-NEXT: flat_store_byte v[20:21], v113 offset:192 +; ALIGNED-NEXT: flat_store_byte v[20:21], v35 
offset:190 +; ALIGNED-NEXT: flat_store_byte v[20:21], v52 offset:188 +; ALIGNED-NEXT: flat_store_byte v[20:21], v65 offset:186 +; ALIGNED-NEXT: flat_store_byte v[20:21], v115 offset:184 +; ALIGNED-NEXT: flat_store_byte v[20:21], v50 offset:182 +; ALIGNED-NEXT: flat_store_byte v[20:21], v86 offset:180 +; ALIGNED-NEXT: flat_store_byte v[20:21], v36 offset:178 +; ALIGNED-NEXT: flat_store_byte v[20:21], v24 offset:176 +; ALIGNED-NEXT: flat_store_byte v[20:21], v29 offset:174 +; ALIGNED-NEXT: flat_store_byte v[20:21], v25 offset:172 +; ALIGNED-NEXT: flat_store_byte v[20:21], v51 offset:170 +; ALIGNED-NEXT: flat_store_byte v[20:21], v82 offset:168 +; ALIGNED-NEXT: flat_store_byte v[20:21], v37 offset:166 +; ALIGNED-NEXT: flat_store_byte v[20:21], v81 offset:164 +; ALIGNED-NEXT: flat_store_byte v[20:21], v30 offset:162 +; ALIGNED-NEXT: flat_store_byte v[20:21], v80 offset:160 +; ALIGNED-NEXT: flat_store_byte v[20:21], v27 offset:158 +; ALIGNED-NEXT: flat_store_byte v[20:21], v114 offset:156 +; ALIGNED-NEXT: flat_store_byte v[20:21], v38 offset:154 +; ALIGNED-NEXT: flat_store_byte v[20:21], v97 offset:152 +; ALIGNED-NEXT: flat_store_byte v[20:21], v31 offset:150 +; ALIGNED-NEXT: flat_store_byte v[20:21], v98 offset:148 +; ALIGNED-NEXT: flat_store_byte v[20:21], v28 offset:146 +; ALIGNED-NEXT: flat_store_byte v[20:21], v87 offset:144 +; ALIGNED-NEXT: flat_store_byte v[20:21], v26 offset:142 +; ALIGNED-NEXT: flat_store_byte v[20:21], v39 offset:124 +; ALIGNED-NEXT: flat_store_byte v[20:21], v19 offset:122 +; ALIGNED-NEXT: flat_store_byte v[20:21], v83 offset:120 +; ALIGNED-NEXT: flat_store_byte v[20:21], v18 offset:118 +; ALIGNED-NEXT: flat_store_byte v[20:21], v99 offset:116 +; ALIGNED-NEXT: flat_store_byte v[20:21], v17 offset:114 +; ALIGNED-NEXT: flat_store_byte v[20:21], v68 offset:112 +; ALIGNED-NEXT: flat_store_byte v[20:21], v16 offset:110 +; ALIGNED-NEXT: flat_store_byte v[20:21], v34 offset:108 +; ALIGNED-NEXT: flat_store_byte v[20:21], v15 offset:106 +; 
ALIGNED-NEXT: flat_store_byte v[20:21], v96 offset:104 +; ALIGNED-NEXT: flat_store_byte v[20:21], v14 offset:102 +; ALIGNED-NEXT: flat_store_byte v[20:21], v100 offset:100 +; ALIGNED-NEXT: flat_store_byte v[20:21], v13 offset:98 +; ALIGNED-NEXT: flat_store_byte v[20:21], v84 offset:96 +; ALIGNED-NEXT: flat_store_byte v[20:21], v12 offset:94 +; ALIGNED-NEXT: flat_store_byte v[20:21], v33 offset:92 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:90 +; ALIGNED-NEXT: flat_store_byte v[20:21], v69 offset:88 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:86 +; ALIGNED-NEXT: flat_store_byte v[20:21], v101 offset:84 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:82 +; ALIGNED-NEXT: flat_store_byte v[20:21], v54 offset:80 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:78 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v7 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:57 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v6 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:53 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v5 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:49 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte_d16_hi v[22:23], v4 offset:44 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:45 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v8, 24, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v7, 8, v7 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v9, 24, v6 @@ -6786,26 +6931,35 @@ define void @memmove_p0_p0_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(0 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v5 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v4 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[16:17], v8 offset:15 -; ALIGNED-NEXT: flat_store_byte v[16:17], v7 offset:13 -; ALIGNED-NEXT: flat_store_byte v[16:17], v9 offset:11 -; ALIGNED-NEXT: flat_store_byte 
v[16:17], v6 offset:9 -; ALIGNED-NEXT: flat_store_byte v[16:17], v10 offset:7 -; ALIGNED-NEXT: flat_store_byte v[16:17], v5 offset:5 -; ALIGNED-NEXT: flat_store_byte v[16:17], v11 offset:3 -; ALIGNED-NEXT: flat_store_byte v[16:17], v4 offset:1 +; ALIGNED-NEXT: flat_store_byte v[20:21], v8 offset:60 +; ALIGNED-NEXT: flat_store_byte v[20:21], v7 offset:58 +; ALIGNED-NEXT: flat_store_byte v[20:21], v9 offset:56 +; ALIGNED-NEXT: flat_store_byte v[20:21], v6 offset:54 +; ALIGNED-NEXT: flat_store_byte v[20:21], v10 offset:52 +; ALIGNED-NEXT: flat_store_byte v[20:21], v5 offset:50 +; ALIGNED-NEXT: flat_store_byte v[20:21], v11 offset:48 +; ALIGNED-NEXT: flat_store_byte v[20:21], v4 offset:46 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB5_5 ; ALIGNED-NEXT: .LBB5_6: ; %Flow6 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 -; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: s_clause 0x10 +; ALIGNED-NEXT: buffer_load_dword v72, off, s[0:3], s32 +; ALIGNED-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; ALIGNED-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 +; ALIGNED-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 +; ALIGNED-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 +; ALIGNED-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 +; ALIGNED-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; ALIGNED-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; ALIGNED-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 +; 
ALIGNED-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; ALIGNED-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 +; ALIGNED-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 +; ALIGNED-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 +; ALIGNED-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 +; ALIGNED-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 +; ALIGNED-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; ALIGNED-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -8422,34 +8576,36 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 
v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) @@ -8488,34 +8644,36 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; CHECK-NEXT: global_load_dwordx4 v[80:83], v[96:97], off offset:32 ; CHECK-NEXT: global_load_dwordx4 v[84:87], v[96:97], off offset:16 ; CHECK-NEXT: global_load_dwordx4 v[96:99], v[96:97], off +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[4:7] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[4:7] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[8:11] offset:224 +; CHECK-NEXT: 
flat_store_dwordx4 v[102:103], v[8:11] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[12:15] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[12:15] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[16:19] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[16:19] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[20:23] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[20:23] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[24:27] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[24:27] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[28:31] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[28:31] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[32:35] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[36:39] offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[36:39] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(4) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] ; CHECK-NEXT: s_waitcnt vmcnt(2) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) @@ -8542,11 +8700,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf -; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -8561,473 +8719,477 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) -; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:168 -; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:172 -; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:164 -; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: buffer_store_dword v100, off, s[0:3], s32 offset:168 +; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:172 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:164 +; 
ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:160 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v100 offset:244 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:254 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:246 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:250 +; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:244 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:252 +; ALIGNED-NEXT: flat_store_byte v[96:97], v117 offset:242 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:248 +; ALIGNED-NEXT: flat_store_byte v[96:97], v118 offset:240 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:246 +; ALIGNED-NEXT: flat_store_byte 
v[96:97], v119 offset:238 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:240 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:184 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:188 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:180 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:176 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:221 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:255 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:243 ; ALIGNED-NEXT: s_waitcnt 
vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:184 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:188 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:180 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:176 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:228 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:237 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:222 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:136 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:140 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:132 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:128 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte 
v[96:97], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte 
v[96:97], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:152 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:156 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:148 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:144 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v53 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:193 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:236 ; ALIGNED-NEXT: buffer_store_dword v65, off, 
s[0:3], s32 offset:228 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:224 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:188 ; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v113 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:186 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:180 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:178 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:176 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:174 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:248 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:244 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 
offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:167 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:161 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:166 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:200 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:196 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:141 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:152 +; ALIGNED-NEXT: 
s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:150 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:146 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:144 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:142 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:220 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:212 ; ALIGNED-NEXT: 
buffer_store_dword v36, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte 
v[96:97], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:126 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:113 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:48 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v30 offset:100 +; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[86:87], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:93 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:8 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:12 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:4 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v4 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], 
v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:87 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v7 +; ALIGNED-NEXT: flat_store_byte 
v[96:97], v66 offset:92 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:90 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:84 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:78 +; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:77 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:24 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:28 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:20 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:16 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 
offset:67 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:62 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; 
ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:116 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[96:97], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:84 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:11 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v71 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:15 -; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:12 +; ALIGNED-NEXT: flat_store_byte v[96:97], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[86:87], v5 +; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[96:97], v98 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v99 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB7_2 ; ALIGNED-NEXT: .LBB7_3: ; %Flow6 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 @@ -9040,11 +9202,11 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 ; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, s4 ; 
ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, s5, v3, vcc_lo -; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v0, s4 -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, s5, v1, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v84, vcc_lo, v0, s4 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v85, null, s5, v1, vcc_lo ; ALIGNED-NEXT: s_clause 0xf ; ALIGNED-NEXT: global_load_dwordx4 v[98:101], v[4:5], off offset:240 -; ALIGNED-NEXT: global_load_dwordx4 v[84:87], v[4:5], off offset:224 +; ALIGNED-NEXT: global_load_dwordx4 v[112:115], v[4:5], off offset:224 ; ALIGNED-NEXT: global_load_dwordx4 v[80:83], v[4:5], off offset:208 ; ALIGNED-NEXT: global_load_dwordx4 v[68:71], v[4:5], off offset:192 ; ALIGNED-NEXT: global_load_dwordx4 v[64:67], v[4:5], off offset:176 @@ -9059,6 +9221,10 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: global_load_dwordx4 v[12:15], v[4:5], off offset:32 ; ALIGNED-NEXT: global_load_dwordx4 v[8:11], v[4:5], off offset:16 ; ALIGNED-NEXT: global_load_dwordx4 v[4:7], v[4:5], off +; ALIGNED-NEXT: v_add_co_u32 v96, vcc_lo, v84, 6 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v97, null, 0, v85, vcc_lo +; ALIGNED-NEXT: v_add_co_u32 v86, vcc_lo, v84, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v87, null, 0, v85, vcc_lo ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: s_waitcnt vmcnt(15) @@ -9066,465 +9232,465 @@ define void @memmove_p0_p4_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(4 ; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:428 ; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:420 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:250 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:254 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:252 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:248 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:246 
-; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:244 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:242 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:240 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v100 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v100 offset:244 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v101 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:249 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:245 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v99 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:241 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v98 offset:236 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:237 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v100 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v98 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v101 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v100 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 24, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v117, 8, v99 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v118, 24, v98 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v119, 8, v98 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:248 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:246 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:252 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v86 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:251 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:249 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:255 -; 
ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:253 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:247 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:245 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v84 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:243 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:250 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v114 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v115 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v113 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v113 +; ALIGNED-NEXT: flat_store_byte v[86:87], v116 offset:244 +; ALIGNED-NEXT: flat_store_byte v[86:87], v117 offset:242 +; ALIGNED-NEXT: flat_store_byte v[86:87], v118 offset:240 +; ALIGNED-NEXT: flat_store_byte v[86:87], v119 offset:238 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:440 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:444 +; ALIGNED-NEXT: buffer_store_dword v113, off, s[0:3], s32 offset:436 +; ALIGNED-NEXT: buffer_store_dword v112, off, s[0:3], s32 offset:432 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v114 offset:228 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v115 offset:232 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:233 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:229 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v113 offset:224 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:225 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v112 offset:220 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:221 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v112 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v112 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v82 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v98 offset:241 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v82 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_store_dword v87, off, s[0:3], s32 offset:444 -; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:436 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:432 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v86 offset:234 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v87 offset:238 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:236 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:232 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v85 offset:230 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:228 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v84 offset:226 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:224 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v83 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v81 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v81 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:235 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:233 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v80 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:239 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v82 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:232 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:230 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v83 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:236 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:234 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v81 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:228 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 
offset:226 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:224 ; ALIGNED-NEXT: s_waitcnt vmcnt(12) ; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:237 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:222 ; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:231 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:229 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:227 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:225 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v69 ; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:392 ; ALIGNED-NEXT: buffer_store_dword v83, off, s[0:3], s32 offset:396 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:388 ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:384 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:218 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:222 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:220 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:216 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:214 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:212 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:210 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:208 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v68 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v82 offset:212 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v83 offset:216 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:217 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:213 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v81 offset:208 +; ALIGNED-NEXT: flat_store_byte 
v[86:87], v81 offset:209 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v80 offset:204 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:205 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v71 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v69 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v69 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:216 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:214 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:220 ; ALIGNED-NEXT: s_waitcnt vmcnt(11) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v66 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:219 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:217 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:223 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:221 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v65 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:215 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:213 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:209 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:218 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:212 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v67 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:210 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v67 +; 
ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:208 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v65 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:206 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v65 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:408 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_store_dword v69, off, s[0:3], s32 offset:404 ; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:202 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:206 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:204 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:198 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:196 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:194 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:192 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v55 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v52 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v55 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:203 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:201 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:207 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v70 offset:196 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v71 offset:200 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:201 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:197 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v69 offset:192 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:193 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v68 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:189 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v64 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v69, 8, v64 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v54 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:200 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:198 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v55 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:204 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:202 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v52 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:196 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:194 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v53 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:192 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:205 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:199 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:197 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v51 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:195 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:193 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:190 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v50 ; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v67, off, s[0:3], s32 offset:492 ; ALIGNED-NEXT: buffer_store_dword v65, off, s[0:3], s32 offset:484 ; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:480 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:186 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v67 offset:190 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:188 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:184 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:182 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:180 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:178 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:176 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v48 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v48 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:187 -; ALIGNED-NEXT: s_waitcnt vmcnt(8) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v39 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v38 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v38 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:185 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v39 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:191 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:189 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:183 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:181 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:179 -; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:177 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v34 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v66 offset:180 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v67 offset:184 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:185 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:181 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v65 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:177 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v64 offset:172 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 
offset:173 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v51 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v49 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:184 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:182 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:188 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:186 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:180 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:178 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:176 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:174 ; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:508 ; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:504 ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:170 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:168 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:174 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:172 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:162 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:160 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:166 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:164 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v35 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v33 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v116, 8, v33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:171 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:169 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v32 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:173 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:163 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:161 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:175 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v30 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:167 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:165 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v29 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v54 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:165 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v55 offset:168 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:169 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v52 offset:156 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:157 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v53 offset:160 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:161 +; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v36 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v38 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v39 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v37 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v36 +; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v34 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:168 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 24, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:166 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v35 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:172 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v33 +; ALIGNED-NEXT: 
flat_store_byte v[86:87], v112 offset:170 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:160 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:158 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:164 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:162 ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:460 ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:448 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:154 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:158 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:156 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:152 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:150 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:148 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:146 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:144 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v28 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v28 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v50 offset:148 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v51 offset:152 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:153 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:149 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v49 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:145 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v48 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:141 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:152 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:150 +; ALIGNED-NEXT: 
v_lshrrev_b32_e32 v115, 8, v28 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v31 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v29 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:155 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:153 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:159 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:157 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:151 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:149 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:147 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:145 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:154 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:148 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:146 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:144 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:142 ; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:472 ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:468 ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:138 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:142 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:140 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:136 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:134 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:132 
-; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:130 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:128 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:143 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:139 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:137 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:141 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:135 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:133 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:131 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v38 offset:132 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v39 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:137 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:133 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v37 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:129 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v36 offset:124 +; ALIGNED-NEXT: flat_store_byte v[84:85], v36 offset:128 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:128 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v18 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v27 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:136 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:134 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:140 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:138 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:132 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:130 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:126 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 8, v18 ; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 
ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:122 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:126 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:124 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:120 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:118 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:116 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:114 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:112 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:123 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:121 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:127 -; ALIGNED-NEXT: flat_store_byte v[96:97], v54 offset:125 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:119 -; ALIGNED-NEXT: flat_store_byte v[96:97], v116 offset:117 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:115 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v34 offset:116 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v35 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:121 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:117 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v33 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:113 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v32 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:109 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:120 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:118 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshrrev_b32_e32 
v70, 24, v14 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:122 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:116 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:114 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:112 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:110 ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:106 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:110 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:108 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:104 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:102 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:100 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:98 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:96 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:111 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 24, v6 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 24, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v18 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v14 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:109 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v70, 8, v6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v30 offset:100 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v31 offset:104 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:105 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:101 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v29 offset:96 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:97 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v28 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:93 +; 
ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:96 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_lshrrev_b32_e32 v114, 24, v6 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 24, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 8, v14 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:94 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v115, 8, v6 ; ALIGNED-NEXT: s_cmp_eq_u64 s[4:5], s[6:7] -; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 8, v27 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 24, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 24, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v86, 24, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:103 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v71, 24, v7 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v24 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v22 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v23 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v21 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v66, 24, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v67, 8, v20 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v98, 8, v19 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v84, 24, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v85, 8, v17 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v32, 24, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v87, 8, v15 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v53, 24, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 8, v13 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 8, v12 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v11 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v30, 24, v9 
-; ALIGNED-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:107 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v52, 24, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:105 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 8, v8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:101 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v7 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:99 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:97 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v99, 8, v25 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v36, 24, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v37, 8, v24 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v34, 24, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v113, 24, v15 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:104 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v82, 24, v11 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:102 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:108 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:106 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:100 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:98 ; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:268 ; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:260 ; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:90 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:94 -; ALIGNED-NEXT: flat_store_byte v[96:97], v27 offset:92 -; ALIGNED-NEXT: flat_store_byte v[96:97], v26 offset:88 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:86 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:84 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:82 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 
offset:80 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v4 -; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v50 offset:91 -; ALIGNED-NEXT: flat_store_byte v[96:97], v51 offset:89 -; ALIGNED-NEXT: flat_store_byte v[96:97], v103 offset:95 -; ALIGNED-NEXT: flat_store_byte v[96:97], v102 offset:93 -; ALIGNED-NEXT: flat_store_byte v[96:97], v101 offset:87 -; ALIGNED-NEXT: flat_store_byte v[96:97], v100 offset:85 -; ALIGNED-NEXT: flat_store_byte v[96:97], v80 offset:83 -; ALIGNED-NEXT: flat_store_byte v[96:97], v81 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v26 offset:84 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v27 offset:88 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:89 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:85 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v25 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:81 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v24 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:77 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v24, 24, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v38, 24, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v39, 8, v22 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v100, 24, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v101, 8, v23 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v102, 24, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v103, 8, v21 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v68, 24, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v69, 8, v20 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v35, 8, v17 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v54, 24, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v55, 8, v16 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v112, 8, v15 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v80, 24, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v81, 8, v13 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v28, 24, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v29, 8, v12 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v83, 8, v11 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v48, 24, v9 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v49, 8, v9 
+; ALIGNED-NEXT: v_lshrrev_b32_e32 v50, 24, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v51, 8, v8 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v25, 8, v7 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v26, 24, v5 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v27, 8, v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 offset:88 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v64, 24, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v65 offset:86 +; ALIGNED-NEXT: v_lshrrev_b32_e32 v65, 8, v4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v66 offset:92 +; ALIGNED-NEXT: flat_store_byte v[86:87], v67 offset:90 +; ALIGNED-NEXT: flat_store_byte v[86:87], v98 offset:84 +; ALIGNED-NEXT: flat_store_byte v[86:87], v99 offset:82 +; ALIGNED-NEXT: flat_store_byte v[86:87], v36 offset:80 +; ALIGNED-NEXT: flat_store_byte v[86:87], v37 offset:78 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:276 ; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:74 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:78 -; ALIGNED-NEXT: flat_store_byte v[96:97], v23 offset:76 -; ALIGNED-NEXT: flat_store_byte v[96:97], v22 offset:72 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:70 -; ALIGNED-NEXT: flat_store_byte v[96:97], v21 offset:68 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:66 -; ALIGNED-NEXT: flat_store_byte v[96:97], v20 offset:64 -; ALIGNED-NEXT: flat_store_byte v[96:97], v64 offset:75 -; ALIGNED-NEXT: flat_store_byte v[96:97], v65 offset:73 -; ALIGNED-NEXT: flat_store_byte v[96:97], v36 offset:79 -; ALIGNED-NEXT: flat_store_byte v[96:97], v37 offset:77 -; ALIGNED-NEXT: flat_store_byte v[96:97], v38 offset:71 -; ALIGNED-NEXT: flat_store_byte v[96:97], v39 offset:69 -; ALIGNED-NEXT: flat_store_byte v[96:97], v66 offset:67 -; ALIGNED-NEXT: flat_store_byte v[96:97], v67 offset:65 +; 
ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v22 offset:68 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v23 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v23 offset:73 +; ALIGNED-NEXT: flat_store_byte v[86:87], v22 offset:69 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v21 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v21 offset:65 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v20 offset:60 +; ALIGNED-NEXT: flat_store_byte v[84:85], v20 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v38 offset:72 +; ALIGNED-NEXT: flat_store_byte v[86:87], v39 offset:70 +; ALIGNED-NEXT: flat_store_byte v[86:87], v100 offset:76 +; ALIGNED-NEXT: flat_store_byte v[86:87], v101 offset:74 +; ALIGNED-NEXT: flat_store_byte v[86:87], v102 offset:68 +; ALIGNED-NEXT: flat_store_byte v[86:87], v103 offset:66 +; ALIGNED-NEXT: flat_store_byte v[86:87], v68 offset:64 +; ALIGNED-NEXT: flat_store_byte v[86:87], v69 offset:62 ; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:364 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:356 ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:352 -; ALIGNED-NEXT: flat_store_byte v[96:97], v82 offset:59 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:58 -; ALIGNED-NEXT: flat_store_byte v[96:97], v83 offset:57 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:62 -; ALIGNED-NEXT: flat_store_byte v[96:97], v99 offset:63 -; ALIGNED-NEXT: flat_store_byte v[96:97], v19 offset:60 -; ALIGNED-NEXT: flat_store_byte v[96:97], v98 offset:61 -; ALIGNED-NEXT: flat_store_byte v[96:97], v18 offset:56 -; ALIGNED-NEXT: flat_store_byte v[96:97], v84 offset:55 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:54 -; ALIGNED-NEXT: flat_store_byte v[96:97], v85 offset:53 -; ALIGNED-NEXT: flat_store_byte v[96:97], v17 offset:52 -; ALIGNED-NEXT: flat_store_byte v[96:97], v32 offset:51 -; ALIGNED-NEXT: 
flat_store_byte_d16_hi v[96:97], v16 offset:50 -; ALIGNED-NEXT: flat_store_byte v[96:97], v33 offset:49 -; ALIGNED-NEXT: flat_store_byte v[96:97], v16 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v52 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v53 offset:54 +; ALIGNED-NEXT: flat_store_byte v[86:87], v32 offset:60 +; ALIGNED-NEXT: flat_store_byte v[86:87], v33 offset:58 +; ALIGNED-NEXT: flat_store_byte v[86:87], v34 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v18 offset:52 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v19 offset:56 +; ALIGNED-NEXT: flat_store_byte v[86:87], v19 offset:57 +; ALIGNED-NEXT: flat_store_byte v[86:87], v18 offset:53 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v17 offset:48 +; ALIGNED-NEXT: flat_store_byte v[86:87], v35 offset:50 +; ALIGNED-NEXT: flat_store_byte v[86:87], v17 offset:49 +; ALIGNED-NEXT: flat_store_byte v[86:87], v54 offset:48 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v16 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v55 offset:46 +; ALIGNED-NEXT: flat_store_byte v[86:87], v16 offset:45 ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:368 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:42 -; ALIGNED-NEXT: flat_store_byte v[96:97], v34 offset:43 -; ALIGNED-NEXT: flat_store_byte v[96:97], v35 offset:41 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:46 -; ALIGNED-NEXT: flat_store_byte v[96:97], v86 offset:47 -; ALIGNED-NEXT: flat_store_byte v[96:97], v15 offset:44 -; ALIGNED-NEXT: flat_store_byte v[96:97], v87 offset:45 -; ALIGNED-NEXT: flat_store_byte v[96:97], v14 offset:40 -; ALIGNED-NEXT: flat_store_byte v[96:97], v53 offset:39 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:38 -; ALIGNED-NEXT: flat_store_byte 
v[96:97], v54 offset:37 -; ALIGNED-NEXT: flat_store_byte v[96:97], v13 offset:36 -; ALIGNED-NEXT: flat_store_byte v[96:97], v55 offset:35 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:34 -; ALIGNED-NEXT: flat_store_byte v[96:97], v113 offset:33 -; ALIGNED-NEXT: flat_store_byte v[96:97], v12 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v14 offset:36 +; ALIGNED-NEXT: flat_store_byte v[86:87], v70 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v71 offset:38 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v15 offset:40 +; ALIGNED-NEXT: flat_store_byte v[86:87], v113 offset:44 +; ALIGNED-NEXT: flat_store_byte v[86:87], v15 offset:41 +; ALIGNED-NEXT: flat_store_byte v[86:87], v112 offset:42 +; ALIGNED-NEXT: flat_store_byte v[86:87], v14 offset:37 +; ALIGNED-NEXT: flat_store_byte v[86:87], v80 offset:36 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v13 offset:32 +; ALIGNED-NEXT: flat_store_byte v[86:87], v81 offset:34 +; ALIGNED-NEXT: flat_store_byte v[86:87], v13 offset:33 +; ALIGNED-NEXT: flat_store_byte v[86:87], v28 offset:32 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v12 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v29 offset:30 +; ALIGNED-NEXT: flat_store_byte v[84:85], v12 offset:32 ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:332 ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:26 -; ALIGNED-NEXT: flat_store_byte v[96:97], v114 offset:27 -; ALIGNED-NEXT: flat_store_byte v[96:97], v115 offset:25 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:30 -; ALIGNED-NEXT: flat_store_byte v[96:97], v28 offset:31 -; ALIGNED-NEXT: flat_store_byte v[96:97], v11 offset:28 -; ALIGNED-NEXT: flat_store_byte v[96:97], v29 offset:29 -; ALIGNED-NEXT: flat_store_byte v[96:97], v10 
offset:24 -; ALIGNED-NEXT: flat_store_byte v[96:97], v30 offset:23 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:22 -; ALIGNED-NEXT: flat_store_byte v[96:97], v31 offset:21 -; ALIGNED-NEXT: flat_store_byte v[96:97], v9 offset:20 -; ALIGNED-NEXT: flat_store_byte v[96:97], v52 offset:19 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:18 -; ALIGNED-NEXT: flat_store_byte v[96:97], v68 offset:17 -; ALIGNED-NEXT: flat_store_byte v[96:97], v8 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v10 offset:20 +; ALIGNED-NEXT: flat_store_byte v[86:87], v30 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v31 offset:22 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v11 offset:24 +; ALIGNED-NEXT: flat_store_byte v[86:87], v82 offset:28 +; ALIGNED-NEXT: flat_store_byte v[86:87], v11 offset:25 +; ALIGNED-NEXT: flat_store_byte v[86:87], v83 offset:26 +; ALIGNED-NEXT: flat_store_byte v[86:87], v10 offset:21 +; ALIGNED-NEXT: flat_store_byte v[86:87], v48 offset:20 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v9 offset:16 +; ALIGNED-NEXT: flat_store_byte v[86:87], v49 offset:18 +; ALIGNED-NEXT: flat_store_byte v[86:87], v9 offset:17 +; ALIGNED-NEXT: flat_store_byte v[86:87], v50 offset:16 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v8 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v51 offset:14 +; ALIGNED-NEXT: flat_store_byte v[84:85], v8 offset:16 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:10 -; ALIGNED-NEXT: flat_store_byte v[96:97], v69 offset:11 -; ALIGNED-NEXT: flat_store_byte v[96:97], v70 offset:9 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:14 -; ALIGNED-NEXT: flat_store_byte v[96:97], v71 offset:15 -; ALIGNED-NEXT: 
flat_store_byte v[96:97], v7 offset:12 -; ALIGNED-NEXT: flat_store_byte v[96:97], v112 offset:13 -; ALIGNED-NEXT: flat_store_byte v[96:97], v6 offset:8 -; ALIGNED-NEXT: flat_store_byte v[96:97], v48 offset:7 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 offset:6 -; ALIGNED-NEXT: flat_store_byte v[96:97], v49 offset:5 -; ALIGNED-NEXT: flat_store_byte v[96:97], v5 offset:4 -; ALIGNED-NEXT: flat_store_byte v[96:97], v24 offset:3 -; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v4 offset:2 -; ALIGNED-NEXT: flat_store_byte v[96:97], v25 offset:1 -; ALIGNED-NEXT: flat_store_byte v[96:97], v4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v6 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v114 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v115 offset:6 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v7 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v24 offset:12 +; ALIGNED-NEXT: flat_store_byte v[86:87], v7 offset:9 +; ALIGNED-NEXT: flat_store_byte v[86:87], v25 offset:10 +; ALIGNED-NEXT: flat_store_byte v[84:85], v6 offset:8 +; ALIGNED-NEXT: flat_store_byte v[86:87], v26 offset:4 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[96:97], v5 +; ALIGNED-NEXT: flat_store_byte v[86:87], v27 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v5 offset:4 +; ALIGNED-NEXT: flat_store_byte v[86:87], v64 +; ALIGNED-NEXT: flat_store_byte_d16_hi v[84:85], v4 offset:2 +; ALIGNED-NEXT: flat_store_byte v[84:85], v65 offset:1 +; ALIGNED-NEXT: flat_store_byte v[84:85], v4 ; ALIGNED-NEXT: s_cbranch_scc0 .LBB7_5 ; ALIGNED-NEXT: .LBB7_6: ; %Flow7 ; ALIGNED-NEXT: s_or_b32 exec_lo, exec_lo, s8 @@ -12503,25 +12669,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: s_add_u32 s4, s4, 0x100 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 +; CHECK-NEXT: v_add_co_ci_u32_e64 
v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, 0 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:224 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:176 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:128 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[80:83] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -12607,29 +12775,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) 
align 1 %dst, ptr addrspace(5 ; CHECK-NEXT: v_add_co_u32 v100, vcc_lo, v0, s4 ; CHECK-NEXT: v_add_co_ci_u32_e64 v101, null, s5, v1, vcc_lo ; CHECK-NEXT: v_add_nc_u32_e32 v2, 0xffffff00, v2 +; CHECK-NEXT: v_add_co_u32 v102, vcc_lo, v100, 48 ; CHECK-NEXT: s_add_u32 s4, s4, 0xffffff00 +; CHECK-NEXT: v_add_co_ci_u32_e64 v103, null, 0, v101, vcc_lo ; CHECK-NEXT: s_addc_u32 s5, s5, -1 ; CHECK-NEXT: s_waitcnt vmcnt(35) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[31:34] offset:240 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[31:34] offset:192 ; CHECK-NEXT: s_waitcnt vmcnt(32) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[27:30] offset:224 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[27:30] offset:176 ; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[48:51] offset:208 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[48:51] offset:160 ; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[35:38] offset:192 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[35:38] offset:144 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[64:67] offset:176 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[64:67] offset:128 ; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[52:55] offset:160 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[52:55] offset:112 ; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[68:71] offset:144 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[68:71] offset:96 ; CHECK-NEXT: s_waitcnt vmcnt(8) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[80:83] offset:128 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[23:26] offset:112 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[19:22] offset:96 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[15:18] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[23:26] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[19:22] offset:48 +; CHECK-NEXT: 
flat_store_dwordx4 v[102:103], v[15:18] offset:32 ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[11:14] offset:64 -; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[7:10] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[102:103], v[7:10] ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[3:6] offset:32 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: flat_store_dwordx4 v[100:101], v[96:99] offset:16 @@ -12736,18 +12906,18 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:47 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:48 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:50 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:51 ; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:56 ; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:57 ; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:58 ; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, 
v2, s[0:3], 0 offen offset:62 ; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:63 ; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:64 ; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:65 @@ -12823,21 +12993,21 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v15, v30, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v32, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v36, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v34, 8, v31 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v37 -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v37 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v51, 8, v52 @@ -12890,17 +13060,17 @@ define void @memmove_p0_p5_sz2048(ptr 
addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 
offset:908 ; 4-byte Folded Spill @@ -12917,7 +13087,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill @@ -12965,37 +13135,37 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:93 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:90 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:89 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 ; ALIGNED-NEXT: s_clause 
0x1 @@ -13102,7 +13272,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -13139,9 +13309,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1264 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1256 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -13154,7 +13324,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 ; 
ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 @@ -13163,31 +13333,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, 
off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -13205,577 +13375,586 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v110 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; 
ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v95 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v104 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 ; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:175 ; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v79, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v74 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:178 ; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:181 ; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 
offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v59 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v62 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v72 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:188 ; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:191 ; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v56 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v45, 8, v46 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v42, 8, v43 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v40 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:193 ; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:196 ; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: 
buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:205 ; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:207 ; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:217 ; ALIGNED-NEXT: 
buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:220 ; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:226 ; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:229 ; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: 
buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:237 ; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 ; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 ; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:233 ; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 ; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:244 ; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 ; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:247 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte 
v12, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:254 ; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: s_clause 0x2 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 -; ALIGNED-NEXT: s_waitcnt vmcnt(61) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(59) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v116, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v113 +; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101 +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v98, 8, v100 +; ALIGNED-NEXT: v_lshl_or_b32 
v108, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v103, 8, v114 ; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 -; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 +; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v87, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v86 +; ALIGNED-NEXT: v_lshl_or_b32 v92, v1, 16, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v5, v15, 8, v20 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v13 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v121, v9, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v70, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v68, 8, v82 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: v_lshl_or_b32 v57, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v64, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 -; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 
v4, v55, 8, v66 +; ALIGNED-NEXT: v_lshl_or_b32 v118, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v51 +; ALIGNED-NEXT: v_lshl_or_b32 v102, v4, 16, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v35 +; ALIGNED-NEXT: v_lshl_or_b32 v85, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v34, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v80, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v33 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 -; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v54, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 -; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v52, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v23 ; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 -; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v22 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 +; ALIGNED-NEXT: v_lshl_or_b32 v24, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 8, v14 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v16, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v6, v8, 8, v10 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: 
buffer_store_dword v124, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v6, v121, 16, v6 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v0, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v121, v121, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v121, v5, 8, v124 +; ALIGNED-NEXT: v_lshl_or_b32 v122, v4, 8, v125 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v121, v4, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v122, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_mov_b32_e32 v4, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v122, v5, 8, v3 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v121, v1, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 16, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:18 ; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 -; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 +; ALIGNED-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:704 +; ALIGNED-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 
offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 -; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:244 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:224 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 -; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 +; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v124 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: v_lshl_or_b32 v127, v1, 8, v125 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_add_co_u32 v121, vcc_lo, v5, s4 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v122, null, s5, v6, vcc_lo +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_add_co_u32 v5, vcc_lo, v121, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v122, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[5:6], v7 offset:247 +; ALIGNED-NEXT: flat_store_byte v[5:6], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[5:6], v8 offset:246 +; ALIGNED-NEXT: flat_store_byte v[5:6], v11 offset:252 +; ALIGNED-NEXT: flat_store_byte v[5:6], v12 offset:250 +; ALIGNED-NEXT: flat_store_byte v[5:6], v13 offset:251 +; ALIGNED-NEXT: flat_store_byte v[5:6], v14 offset:249 +; ALIGNED-NEXT: flat_store_byte v[5:6], v10 offset:245 +; ALIGNED-NEXT: flat_store_byte v[5:6], v17 offset:239 +; ALIGNED-NEXT: flat_store_byte v[5:6], v18 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v21 offset:238 +; ALIGNED-NEXT: flat_store_byte v[5:6], v15 offset:244 +; ALIGNED-NEXT: flat_store_byte v[5:6], v19 offset:242 +; ALIGNED-NEXT: flat_store_byte v[5:6], v20 offset:243 +; ALIGNED-NEXT: flat_store_byte v[5:6], v22 offset:241 +; ALIGNED-NEXT: flat_store_byte v[5:6], v23 offset:237 +; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v85, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[5:6], v25 offset:231 +; ALIGNED-NEXT: flat_store_byte v[5:6], v29 offset:232 +; ALIGNED-NEXT: flat_store_byte v[5:6], v26 offset:230 +; ALIGNED-NEXT: flat_store_byte v[5:6], v30 offset:236 +; ALIGNED-NEXT: flat_store_byte v[5:6], v31 offset:234 +; ALIGNED-NEXT: flat_store_byte v[5:6], v32 offset:235 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v33 offset:233 +; ALIGNED-NEXT: flat_store_byte v[5:6], v28 offset:229 +; ALIGNED-NEXT: flat_store_byte v[5:6], v35 offset:223 +; ALIGNED-NEXT: flat_store_byte v[5:6], v36 offset:224 +; ALIGNED-NEXT: flat_store_byte v[5:6], v39 offset:222 +; ALIGNED-NEXT: flat_store_byte v[5:6], v34 offset:228 +; ALIGNED-NEXT: flat_store_byte v[5:6], v37 offset:226 +; ALIGNED-NEXT: flat_store_byte v[5:6], v38 offset:227 +; ALIGNED-NEXT: flat_store_byte v[5:6], v48 offset:225 +; ALIGNED-NEXT: flat_store_byte v[5:6], v49 offset:221 +; ALIGNED-NEXT: buffer_store_dword v102, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v118, off, s[0:3], s32 offset:204 ; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 -; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v70 offset:210 +; ALIGNED-NEXT: flat_store_byte v[5:6], v68 offset:212 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v50 offset:206 +; ALIGNED-NEXT: flat_store_byte v[5:6], v71 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v51 offset:207 +; ALIGNED-NEXT: flat_store_byte v[5:6], v82 offset:211 +; ALIGNED-NEXT: flat_store_byte v[5:6], v84 offset:209 +; ALIGNED-NEXT: flat_store_byte v[5:6], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[5:6], v64 offset:216 +; ALIGNED-NEXT: flat_store_byte v[5:6], v69 offset:214 +; ALIGNED-NEXT: flat_store_byte v[5:6], v55 offset:220 +; ALIGNED-NEXT: flat_store_byte v[5:6], v67 offset:218 +; ALIGNED-NEXT: flat_store_byte v[5:6], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[5:6], v81 offset:217 +; ALIGNED-NEXT: flat_store_byte v[5:6], v83 offset:213 +; ALIGNED-NEXT: flat_store_byte v[5:6], v53 offset:205 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:216 ; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 
-; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v108, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[5:6], v86 offset:199 +; ALIGNED-NEXT: flat_store_byte v[5:6], v97 offset:200 +; ALIGNED-NEXT: flat_store_byte v[5:6], v87 offset:198 +; ALIGNED-NEXT: flat_store_byte v[5:6], v98 offset:204 +; ALIGNED-NEXT: flat_store_byte v[5:6], v99 offset:202 +; ALIGNED-NEXT: flat_store_byte v[5:6], v100 offset:203 +; ALIGNED-NEXT: flat_store_byte v[5:6], v101 offset:201 +; ALIGNED-NEXT: flat_store_byte v[5:6], v96 offset:197 +; ALIGNED-NEXT: flat_store_byte v[5:6], v113 offset:191 +; ALIGNED-NEXT: flat_store_byte v[5:6], v112 offset:192 +; ALIGNED-NEXT: flat_store_byte v[5:6], v116 offset:190 +; ALIGNED-NEXT: flat_store_byte v[5:6], v103 offset:196 +; ALIGNED-NEXT: flat_store_byte v[5:6], v115 offset:194 +; ALIGNED-NEXT: flat_store_byte v[5:6], v114 offset:195 +; ALIGNED-NEXT: flat_store_byte v[5:6], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[5:6], v119 offset:189 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1496 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:292 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v40 offset:183 +; ALIGNED-NEXT: flat_store_byte v[5:6], v44 offset:184 +; ALIGNED-NEXT: flat_store_byte v[5:6], v42 offset:182 +; ALIGNED-NEXT: flat_store_byte v[5:6], v45 offset:188 +; ALIGNED-NEXT: flat_store_byte v[5:6], v47 offset:186 +; ALIGNED-NEXT: flat_store_byte v[5:6], v46 offset:187 +; ALIGNED-NEXT: flat_store_byte v[5:6], v56 offset:185 +; ALIGNED-NEXT: flat_store_byte v[5:6], v43 offset:181 +; ALIGNED-NEXT: flat_store_byte v[5:6], v59 offset:175 +; ALIGNED-NEXT: flat_store_byte v[5:6], v60 offset:176 +; ALIGNED-NEXT: flat_store_byte v[5:6], v63 offset:174 +; ALIGNED-NEXT: flat_store_byte v[5:6], v58 offset:180 +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v61 offset:178 +; ALIGNED-NEXT: flat_store_byte v[5:6], v62 offset:179 +; ALIGNED-NEXT: flat_store_byte v[5:6], v72 offset:177 +; ALIGNED-NEXT: flat_store_byte v[5:6], v73 offset:173 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v107 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v74 offset:167 +; ALIGNED-NEXT: flat_store_byte v[5:6], v78 offset:168 +; ALIGNED-NEXT: flat_store_byte v[5:6], v75 offset:166 +; ALIGNED-NEXT: flat_store_byte v[5:6], v79 offset:172 +; ALIGNED-NEXT: flat_store_byte v[5:6], v89 offset:170 +; ALIGNED-NEXT: flat_store_byte v[5:6], v88 offset:171 +; ALIGNED-NEXT: flat_store_byte v[5:6], v90 offset:169 +; ALIGNED-NEXT: flat_store_byte v[5:6], v77 offset:165 +; ALIGNED-NEXT: flat_store_byte v[5:6], v95 offset:159 +; ALIGNED-NEXT: flat_store_byte v[5:6], v94 offset:160 +; ALIGNED-NEXT: flat_store_byte v[5:6], v106 offset:158 +; ALIGNED-NEXT: flat_store_byte v[5:6], v93 offset:164 +; ALIGNED-NEXT: flat_store_byte v[5:6], v105 offset:162 +; ALIGNED-NEXT: flat_store_byte v[5:6], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[5:6], v107 offset:161 +; ALIGNED-NEXT: flat_store_byte v[5:6], v109 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload 
+; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 +; ALIGNED-NEXT: flat_store_byte v[5:6], v110 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v120 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v111 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v126 offset:156 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:152 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:155 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[5:6], v123 offset:149 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte 
v[5:6], v0 offset:143 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:144 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:145 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 +; ALIGNED-NEXT: 
flat_store_byte v[5:6], v0 offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:135 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:138 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:139 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:130 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:127 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v0 offset:129 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:126 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:134 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:131 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:128 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 @@ -13790,52 +13969,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:122 +; 
ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 
; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:112 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 @@ -13850,112 +14029,112 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:108 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 
offset:98 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload +; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:90 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:89 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:95 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:88 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:92 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:90 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:92 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:89 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:84 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 @@ -13970,52 +14149,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:70 ; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:64 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 @@ -14028,61 +14207,61 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], 
v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:63 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:62 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:57 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:56 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:53 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:49 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:46 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:55 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:52 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload @@ -14090,52 +14269,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:42 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:39 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:40 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:44 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:44 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:41 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:35 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:32 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:39 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:36 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:33 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 @@ -14148,295 +14327,298 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v124 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[5:6], v1 offset:14 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[121:122], v125 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v3 offset:7 +; ALIGNED-NEXT: flat_store_byte v[5:6], v4 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:10 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 
; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:6 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:12 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:12 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:9 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:2 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:1 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 +; ALIGNED-NEXT: flat_store_byte v[5:6], v0 offset:3 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:4 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 offset:4 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 +; ALIGNED-NEXT: flat_store_byte v[121:122], v0 ; ALIGNED-NEXT: s_cbranch_scc1 .LBB9_1 ; ALIGNED-NEXT: .LBB9_2: ; %Flow10 ; ALIGNED-NEXT: s_andn2_saveexec_b32 s8, s6 ; ALIGNED-NEXT: s_cbranch_execz .LBB9_5 ; ALIGNED-NEXT: ; %bb.3: ; %memmove_bwd_loop.preheader -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0x700, v2 +; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0x700, v2 ; ALIGNED-NEXT: s_movk_i32 s6, 0xff00 ; ALIGNED-NEXT: s_mov_b64 s[4:5], 0x700 ; ALIGNED-NEXT: s_mov_b32 s7, -1 
; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x39 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:23 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:31 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:32 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:33 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:34 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:27 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:35 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:36 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:37 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:38 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:39 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:40 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:41 -; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:42 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:43 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:44 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen 
offset:45 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:46 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:47 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:48 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:49 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:50 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:51 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:52 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:53 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:54 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:55 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:56 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:57 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:58 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:60 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:61 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:62 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:63 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:64 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:65 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:66 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:59 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:67 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:68 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:69 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:70 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:71 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:76 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:77 -; ALIGNED-NEXT: 
buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:20 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:21 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:22 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:23 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:24 +; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:25 +; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:26 +; ALIGNED-NEXT: buffer_load_ubyte v127, v6, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:28 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:29 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:30 +; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:31 +; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:32 +; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:33 +; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:34 +; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:27 +; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:35 +; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:36 +; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:37 +; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:38 +; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:39 +; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:40 +; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:41 +; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:42 +; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen 
offset:43 +; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:44 +; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:45 +; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:46 +; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:47 +; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:48 +; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:49 +; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:50 +; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:51 +; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:52 +; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:53 +; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:54 +; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:55 +; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:56 +; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:57 +; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:58 +; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:60 +; ALIGNED-NEXT: buffer_load_ubyte v50, v6, s[0:3], 0 offen offset:61 +; ALIGNED-NEXT: buffer_load_ubyte v48, v6, s[0:3], 0 offen offset:62 +; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:63 +; ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:64 +; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:65 +; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:66 +; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:59 +; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:67 +; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:68 +; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:69 +; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:70 +; ALIGNED-NEXT: buffer_load_ubyte v68, v6, s[0:3], 0 offen offset:71 +; ALIGNED-NEXT: 
buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:76 +; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:77 +; ALIGNED-NEXT: buffer_load_ubyte v81, v6, s[0:3], 0 offen offset:75 +; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:78 +; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:79 +; ALIGNED-NEXT: buffer_load_ubyte v124, v6, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(56) +; ALIGNED-NEXT: s_waitcnt vmcnt(57) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(55) +; ALIGNED-NEXT: s_waitcnt vmcnt(56) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: s_waitcnt vmcnt(55) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(54) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(51) +; ALIGNED-NEXT: s_waitcnt vmcnt(52) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(50) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; 
ALIGNED-NEXT: s_waitcnt vmcnt(47) +; ALIGNED-NEXT: s_waitcnt vmcnt(48) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: s_waitcnt vmcnt(47) ; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(45) +; ALIGNED-NEXT: s_waitcnt vmcnt(46) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(42) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v4 +; ALIGNED-NEXT: s_waitcnt vmcnt(43) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(41) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(39) +; ALIGNED-NEXT: v_lshl_or_b32 v8, v20, 8, v19 +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(37) -; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(35) -; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(33) -; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(31) -; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 +; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: v_lshl_or_b32 v10, v18, 8, v17 +; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: v_lshl_or_b32 v11, v26, 8, v22 +; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; 
ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v27 +; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: v_lshl_or_b32 v13, v23, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(29) -; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 +; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: v_lshl_or_b32 v14, v25, 8, v24 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v4 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v10, 16, v9 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(27) +; ALIGNED-NEXT: s_waitcnt vmcnt(28) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(25) +; ALIGNED-NEXT: s_waitcnt vmcnt(26) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(23) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(24) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(21) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(22) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(16) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 -; ALIGNED-NEXT: buffer_store_dword v5, 
off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 +; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v39 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v49, 8, v48 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v5, v51, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(11) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v5, 16, v3 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:85 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v4, 16, v3 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:85 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v7, 16, v5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: s_waitcnt vmcnt(11) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(8) +; ALIGNED-NEXT: s_waitcnt vmcnt(9) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) 
; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:86 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:82 -; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:86 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:82 +; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:83 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 -; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:73 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:73 ; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:72 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:72 ; ALIGNED-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill @@ -14445,11 +14627,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v37, off, 
s[0:3], s32 offset:876 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill @@ -14465,13 +14647,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 
offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) @@ -14484,11 +14666,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v81, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:84 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:84 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:81 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:81 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:80 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:80 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1040 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14499,48 +14681,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:98 +; ALIGNED-NEXT: buffer_load_ubyte v7, 
v6, s[0:3], 0 offen offset:98 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:102 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:103 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:102 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:103 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:94 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:95 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:94 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:95 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:93 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:91 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:93 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:91 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1052 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:92 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:92 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1060 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:90 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:101 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:101 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:89 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:89 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:88 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:88 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14548,13 +14730,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; 
ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:99 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:100 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:99 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:100 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:97 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:97 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:96 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:96 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14565,48 +14747,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:114 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:114 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:118 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:119 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:118 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:119 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte 
Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:110 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:111 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:110 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:109 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:107 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:109 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:107 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:108 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:108 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: 
v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:106 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:106 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:117 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:117 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:105 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:105 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:104 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:104 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14614,13 +14796,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:115 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:116 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:115 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:116 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 
-; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:113 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:113 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:112 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:112 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) @@ -14631,48 +14813,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:130 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:130 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:134 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:135 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:134 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:135 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:126 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:127 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:126 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:127 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte 
v1, v4, s[0:3], 0 offen offset:125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:123 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:125 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:123 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:124 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:124 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1276 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1280 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1224 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:122 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:122 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1240 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:133 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:133 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, 
v4, s[0:3], 0 offen offset:121 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:121 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1232 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:120 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:120 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1244 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1272 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1236 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14680,13 +14862,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:131 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:132 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:131 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:132 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1268 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:129 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:129 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1248 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:128 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:128 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt 
vmcnt(2) @@ -14697,48 +14879,48 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_load_ubyte v5, v6, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:142 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:143 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:142 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:143 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:141 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:141 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:140 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; 
ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:138 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:149 +; ALIGNED-NEXT: buffer_load_ubyte v4, v6, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:137 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:136 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14746,107 +14928,106 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:144 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v3, v6, s[0:3], 0 offen offset:159 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 -; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v6, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:157 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 
offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v6, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v6, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v6, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v108, v6, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: 
buffer_load_ubyte v105, v6, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v6, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v6, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v106, v6, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v6, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v6, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v6, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v104 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v106 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v78, v6, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v77, v6, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v76, v6, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v75, v6, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v74, v6, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, 
v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v78 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v76 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v73, v6, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v63, v6, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v62, v6, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v62 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v61, v6, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v6, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v6, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: 
buffer_load_ubyte v56, v6, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v6, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v6, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v6, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v6, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) @@ -14859,11 +15040,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v44, v6, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v6, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v6, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v6, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v6, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) @@ -14871,289 +15052,291 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: 
buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v6, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v6, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v116, v6, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v116 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x3e -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen 
offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 
-; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v4, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v4, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v4, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v4, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v11, v4, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v4, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: buffer_load_ubyte v115, v6, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v6, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v6, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v6, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v6, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v6, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v6, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v6, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v6, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v6, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v6, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v6, s[0:3], 0 offen offset:207 +; 
ALIGNED-NEXT: buffer_load_ubyte v85, v6, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v6, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v6, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v6, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v6, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v67, v6, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v6, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v64, v6, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v65, v6, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v6, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v66, v6, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v6, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v6, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v6, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v6, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v54, v6, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v6, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v49, v6, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v6, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v6, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v6, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v6, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v6, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v6, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v6, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v6, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v6, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v6, s[0:3], 0 offen offset:231 +; 
ALIGNED-NEXT: buffer_load_ubyte v29, v6, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v6, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v6, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v6, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v6, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v6, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v6, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v22, v6, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v21, v6, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v19, v6, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v6, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v16, v6, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v20, v6, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v6, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v18, v6, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v14, v6, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v13, v6, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v6, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v6, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v10, v6, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v6, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v8, v6, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v7, v6, s[0:3], 0 offen offset:249 ; ALIGNED-NEXT: s_clause 0x6 -; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 -; 
ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: buffer_load_ubyte v1, v6, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:7 ; ALIGNED-NEXT: s_waitcnt vmcnt(62) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 -; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 -; ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(60) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v100, 8, v101 ; ALIGNED-NEXT: s_waitcnt vmcnt(58) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 +; ALIGNED-NEXT: v_lshl_or_b32 v107, v2, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v99, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v90, v2, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v18 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v12 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: v_lshl_or_b32 v88, v9, 8, v1 +; ALIGNED-NEXT: 
buffer_load_ubyte v0, v6, s[0:3], 0 offen +; ALIGNED-NEXT: v_lshl_or_b32 v79, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v89, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v72, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v70 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v66, 8, v71 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 -; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v117, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 ; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 -; ALIGNED-NEXT: 
v_lshl_or_b32 v2, v38, 8, v50 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v39 ; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 ; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 -; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v68, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 -; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v50, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v23, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v22 ; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v2, v19, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v16, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v20 +; ALIGNED-NEXT: 
v_lshl_or_b32 v3, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v11, 8, v13 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v7, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v5, v88, 16, v5 +; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:1 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 -; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v88, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v109, 8, v93 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v121, 8, v110 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v6, s[0:3], 0 offen offset:12 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte 
v104, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v125, v6, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:14 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v125, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v6, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v6, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v89, v126, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v121, v6, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v110, v6, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v6, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, 
v122 +; ALIGNED-NEXT: v_lshl_or_b32 v88, v110, 8, v121 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v89, v120, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 16, v88 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_load_ubyte v88, v6, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v93, v6, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v89, v6, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:488 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:492 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 -; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 +; ALIGNED-NEXT: v_add_nc_u32_e32 v6, 0xffffff00, v6 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v88 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v89, 8, v93 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v3, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[2:3], v1 offset:250 -; ALIGNED-NEXT: flat_store_byte v[2:3], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte v[2:3], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[2:3], v9 offset:253 -; ALIGNED-NEXT: flat_store_byte v[2:3], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[2:3], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[2:3], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[2:3], v13 offset:242 -; ALIGNED-NEXT: flat_store_byte v[2:3], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[2:3], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[2:3], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[2:3], v15 offset:245 -; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_add_co_u32 v4, vcc_lo, v2, 3 +; ALIGNED-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v3, vcc_lo +; ALIGNED-NEXT: flat_store_byte v[4:5], v1 offset:247 +; ALIGNED-NEXT: flat_store_byte v[4:5], v9 offset:248 +; ALIGNED-NEXT: flat_store_byte v[4:5], v7 offset:246 +; ALIGNED-NEXT: flat_store_byte v[4:5], v10 offset:252 +; ALIGNED-NEXT: flat_store_byte v[4:5], v11 offset:250 +; ALIGNED-NEXT: flat_store_byte v[4:5], v12 offset:251 +; ALIGNED-NEXT: flat_store_byte v[4:5], v13 offset:249 +; ALIGNED-NEXT: flat_store_byte v[4:5], v8 offset:245 +; ALIGNED-NEXT: flat_store_byte v[4:5], v15 offset:239 +; ALIGNED-NEXT: flat_store_byte v[4:5], v16 offset:240 +; ALIGNED-NEXT: flat_store_byte v[4:5], v19 offset:238 +; ALIGNED-NEXT: flat_store_byte v[4:5], v14 offset:244 +; ALIGNED-NEXT: flat_store_byte v[4:5], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[4:5], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[4:5], v20 offset:241 +; ALIGNED-NEXT: 
flat_store_byte v[4:5], v21 offset:237 ; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:500 +; ALIGNED-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v68, off, s[0:3], s32 offset:500 ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 +; ALIGNED-NEXT: flat_store_byte v[4:5], v22 offset:231 +; ALIGNED-NEXT: flat_store_byte v[4:5], v25 offset:232 +; ALIGNED-NEXT: flat_store_byte v[4:5], v23 offset:230 +; ALIGNED-NEXT: flat_store_byte v[4:5], v26 offset:236 +; ALIGNED-NEXT: flat_store_byte v[4:5], v27 offset:234 +; ALIGNED-NEXT: flat_store_byte v[4:5], v28 offset:235 +; ALIGNED-NEXT: flat_store_byte v[4:5], v29 offset:233 +; ALIGNED-NEXT: flat_store_byte v[4:5], v24 offset:229 +; ALIGNED-NEXT: flat_store_byte v[4:5], v31 offset:223 +; ALIGNED-NEXT: flat_store_byte v[4:5], v32 offset:224 +; ALIGNED-NEXT: flat_store_byte v[4:5], v35 offset:222 +; ALIGNED-NEXT: flat_store_byte 
v[4:5], v30 offset:228 +; ALIGNED-NEXT: flat_store_byte v[4:5], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[4:5], v34 offset:227 +; ALIGNED-NEXT: flat_store_byte v[4:5], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[4:5], v37 offset:221 ; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 ; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 -; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:456 ; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 -; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 -; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: flat_store_byte v[4:5], v67 offset:210 +; ALIGNED-NEXT: flat_store_byte v[4:5], v64 offset:212 +; ALIGNED-NEXT: flat_store_byte v[4:5], v38 offset:206 +; ALIGNED-NEXT: flat_store_byte v[4:5], v65 offset:208 +; ALIGNED-NEXT: flat_store_byte v[4:5], v39 offset:207 +; ALIGNED-NEXT: flat_store_byte v[4:5], v70 offset:211 +; ALIGNED-NEXT: flat_store_byte 
v[4:5], v80 offset:209 +; ALIGNED-NEXT: flat_store_byte v[4:5], v53 offset:215 +; ALIGNED-NEXT: flat_store_byte v[4:5], v52 offset:216 +; ALIGNED-NEXT: flat_store_byte v[4:5], v66 offset:214 +; ALIGNED-NEXT: flat_store_byte v[4:5], v51 offset:220 +; ALIGNED-NEXT: flat_store_byte v[4:5], v55 offset:218 +; ALIGNED-NEXT: flat_store_byte v[4:5], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[4:5], v69 offset:217 +; ALIGNED-NEXT: flat_store_byte v[4:5], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[4:5], v49 offset:205 +; ALIGNED-NEXT: buffer_store_dword v72, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v79, off, s[0:3], s32 offset:476 ; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 -; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[4:5], v82 offset:199 +; ALIGNED-NEXT: flat_store_byte v[4:5], v85 offset:200 +; ALIGNED-NEXT: flat_store_byte v[4:5], v83 offset:198 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v86 offset:204 +; ALIGNED-NEXT: flat_store_byte v[4:5], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[4:5], v96 offset:203 +; ALIGNED-NEXT: flat_store_byte v[4:5], v97 offset:201 +; ALIGNED-NEXT: flat_store_byte v[4:5], v84 offset:197 +; ALIGNED-NEXT: flat_store_byte v[4:5], v101 offset:191 +; ALIGNED-NEXT: flat_store_byte v[4:5], v100 offset:192 +; ALIGNED-NEXT: flat_store_byte v[4:5], v112 offset:190 +; ALIGNED-NEXT: flat_store_byte v[4:5], v99 offset:196 +; ALIGNED-NEXT: flat_store_byte v[4:5], v103 offset:194 +; ALIGNED-NEXT: flat_store_byte v[4:5], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[4:5], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[4:5], v115 offset:189 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 @@ -15169,22 +15352,22 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 -; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 -; 
ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 -; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 +; ALIGNED-NEXT: flat_store_byte v[4:5], v116 offset:183 +; ALIGNED-NEXT: flat_store_byte v[4:5], v40 offset:184 +; ALIGNED-NEXT: flat_store_byte v[4:5], v118 offset:182 +; ALIGNED-NEXT: flat_store_byte v[4:5], v41 offset:188 +; ALIGNED-NEXT: flat_store_byte v[4:5], v43 offset:186 +; ALIGNED-NEXT: flat_store_byte v[4:5], v42 offset:187 +; ALIGNED-NEXT: flat_store_byte v[4:5], v44 offset:185 +; ALIGNED-NEXT: flat_store_byte v[4:5], v119 offset:181 +; ALIGNED-NEXT: flat_store_byte v[4:5], v47 offset:175 +; ALIGNED-NEXT: flat_store_byte v[4:5], v56 offset:176 +; ALIGNED-NEXT: flat_store_byte v[4:5], v59 offset:174 +; ALIGNED-NEXT: flat_store_byte v[4:5], v46 offset:180 +; ALIGNED-NEXT: flat_store_byte v[4:5], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[4:5], v58 offset:179 +; ALIGNED-NEXT: flat_store_byte v[4:5], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[4:5], v61 offset:173 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 @@ -15197,23 +15380,23 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 -; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 
offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 -; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v62 offset:167 +; ALIGNED-NEXT: flat_store_byte v[4:5], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[4:5], v63 offset:166 +; ALIGNED-NEXT: flat_store_byte v[4:5], v75 offset:172 +; ALIGNED-NEXT: flat_store_byte v[4:5], v77 offset:170 +; ALIGNED-NEXT: flat_store_byte v[4:5], v76 offset:171 +; ALIGNED-NEXT: flat_store_byte v[4:5], v78 offset:169 +; ALIGNED-NEXT: flat_store_byte v[4:5], v73 offset:165 +; ALIGNED-NEXT: flat_store_byte v[4:5], v94 offset:159 +; ALIGNED-NEXT: flat_store_byte v[4:5], v92 offset:160 +; ALIGNED-NEXT: flat_store_byte v[4:5], v105 offset:158 +; ALIGNED-NEXT: flat_store_byte v[4:5], v91 offset:164 +; ALIGNED-NEXT: flat_store_byte v[4:5], v95 offset:162 +; ALIGNED-NEXT: flat_store_byte v[4:5], v104 offset:163 +; ALIGNED-NEXT: flat_store_byte v[4:5], v106 offset:161 +; ALIGNED-NEXT: flat_store_byte v[4:5], v108 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload @@ -15225,44 +15408,46 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v111 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v111 offset:151 +; ALIGNED-NEXT: flat_store_byte v[4:5], v124 offset:152 +; ALIGNED-NEXT: flat_store_byte v[4:5], v122 offset:150 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:156 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:154 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:155 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:153 +; ALIGNED-NEXT: flat_store_byte v[4:5], v123 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:143 +; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:147 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:144 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:142 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:148 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:145 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:144 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 
offset:141 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 @@ -15277,49 +15462,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:138 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:135 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:139 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:136 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:137 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:134 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:143 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:141 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:138 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:142 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1288 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:140 +; ALIGNED-NEXT: flat_store_byte 
v[4:5], v0 offset:137 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:136 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:133 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1268 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:130 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:127 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1260 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:131 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:128 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1256 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:129 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1280 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:135 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:132 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1272 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:133 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:130 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1276 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:134 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:131 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1264 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:132 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:129 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:128 @@ -15337,52 +15522,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1244 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:122 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:119 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1240 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:123 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:120 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1236 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:121 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:118 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1224 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:127 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:124 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:125 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:122 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:126 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:123 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:124 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:121 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:120 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:117 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:114 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:111 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:115 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:112 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:113 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:110 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:119 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:116 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:117 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:114 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:118 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:115 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:116 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:113 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:112 +; ALIGNED-NEXT: 
flat_store_byte v[4:5], v0 offset:109 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 @@ -15397,52 +15582,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:106 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:103 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:107 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:104 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:105 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:102 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:111 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:108 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:109 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:106 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:110 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:107 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:108 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:105 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:104 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:101 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:98 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:95 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:99 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:96 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:97 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:94 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:103 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:100 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:101 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:98 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:102 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:99 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:100 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:97 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:96 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:93 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 @@ -15457,52 +15642,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:90 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:87 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1080 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:91 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:88 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:89 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:86 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:95 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:92 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1056 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:93 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:90 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1060 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:94 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:91 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:92 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:89 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:88 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:85 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:82 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:79 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:83 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:80 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1016 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:81 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:78 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1040 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:87 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:84 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1032 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:85 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:82 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1036 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:86 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:83 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1024 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:84 +; ALIGNED-NEXT: flat_store_byte 
v[4:5], v0 offset:81 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1012 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:80 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:77 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1008 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 @@ -15517,49 +15702,49 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1004 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:74 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:71 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1000 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:75 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:72 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:73 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:70 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:79 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:76 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:77 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:74 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:78 +; ALIGNED-NEXT: flat_store_byte 
v[4:5], v0 offset:75 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:76 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:73 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:72 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:69 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:66 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:63 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:67 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:64 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:65 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:62 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:71 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:68 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:69 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:66 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:70 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:67 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:68 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:65 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:64 @@ -15577,52 +15762,52 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:61 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:58 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:58 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:55 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:59 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:56 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:57 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:54 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:63 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:60 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:59 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:62 +; 
ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:57 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:60 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:56 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:53 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:53 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:50 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:50 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:47 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:51 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:48 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:49 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:46 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:55 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:52 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:54 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:51 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v0 offset:52 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:49 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:48 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:45 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 @@ -15637,149 +15822,147 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:43 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:40 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:42 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:39 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:41 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:38 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:40 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:47 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:37 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:46 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:44 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:43 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:45 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:42 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:44 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:41 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:35 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:32 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:34 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:31 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:33 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:30 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:32 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 
offset:39 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:36 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:38 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:35 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:37 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:34 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:36 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:33 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:23 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload ; ALIGNED-NEXT: 
s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:27 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:24 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:25 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:22 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:31 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:28 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:29 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:26 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:30 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:27 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:28 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:25 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[4:5], v88 offset:15 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:17 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:16 +; ALIGNED-NEXT: flat_store_byte v[4:5], v89 offset:14 ; ALIGNED-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:20 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:21 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:18 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:22 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:19 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:16 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v109 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 +; ALIGNED-NEXT: flat_store_byte v[4:5], v109 offset:7 +; ALIGNED-NEXT: flat_store_byte v[4:5], v120 offset:8 +; ALIGNED-NEXT: flat_store_byte v[4:5], v125 offset:10 +; ALIGNED-NEXT: flat_store_byte v[4:5], v110 offset:6 +; ALIGNED-NEXT: flat_store_byte v[4:5], v126 offset:12 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:11 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:9 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded 
Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[4:5], v0 offset:3 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:4 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 9e2906cf85432..bf516f8b91c91 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -3486,7 +3486,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic ret void } @@ -3772,7 +3772,7 @@ define amdgpu_kernel void 
@flat_agent_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic ret void } @@ -4052,7 +4052,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic ret void } @@ -4365,7 +4365,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic ret void } @@ -4678,7 +4678,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic ret void } @@ -4964,7 +4964,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire ret void } @@ -5250,7 +5250,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("agent") acquire acquire ret void } @@ -5563,7 +5563,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire ret void } @@ -5876,7 +5876,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire ret void } @@ -6189,7 +6189,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire ret void } @@ -6502,7 +6502,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst ret void } @@ -6815,7 +6815,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst ret void } @@ -7128,7 +7128,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr 
%out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst ret void } @@ -7441,7 +7441,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst ret void } @@ -7754,7 +7754,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } @@ -8053,7 +8053,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8370,7 +8370,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8698,7 +8698,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic %val0 = extractvalue { i32, 
i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9042,7 +9042,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9386,7 +9386,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9703,7 +9703,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10020,7 +10020,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10364,7 +10364,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 
%old, i32 %in syncscope("agent") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10708,7 +10708,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11052,7 +11052,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11396,7 +11396,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11740,7 +11740,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12084,7 +12084,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = 
getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12428,7 +12428,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12772,7 +12772,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16295,7 +16295,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic ret void } @@ -16577,7 +16577,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic ret void } @@ -16857,7 +16857,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds 
i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic ret void } @@ -17166,7 +17166,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic ret void } @@ -17475,7 +17475,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic ret void } @@ -17757,7 +17757,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire ret void } @@ -18039,7 +18039,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire ret void } @@ -18348,7 +18348,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire ret void } @@ -18657,7 +18657,7 @@ define amdgpu_kernel void 
@flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire ret void } @@ -18966,7 +18966,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire ret void } @@ -19275,7 +19275,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst ret void } @@ -19584,7 +19584,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst ret void } @@ -19893,7 +19893,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst ret void } @@ -20202,7 +20202,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, 
i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst ret void } @@ -20511,7 +20511,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst ret void } @@ -20810,7 +20810,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21138,7 +21138,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21466,7 +21466,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21821,7 +21821,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr 
inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22176,7 +22176,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22504,7 +22504,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22832,7 +22832,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23187,7 +23187,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23542,7 +23542,7 @@ define amdgpu_kernel void 
@flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23897,7 +23897,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24252,7 +24252,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24607,7 +24607,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24962,7 +24962,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("agent-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -25317,7 +25317,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -25672,7 +25672,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll index 27283beb4b877..b2b71c246c97b 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll @@ -3479,7 +3479,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic ret void } @@ -3765,7 +3765,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic ret void } @@ -4044,7 +4044,7 
@@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release monotonic ret void } @@ -4356,7 +4356,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic ret void } @@ -4668,7 +4668,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic ret void } @@ -4954,7 +4954,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire ret void } @@ -5240,7 +5240,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire ret void } @@ -5552,7 +5552,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg 
volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release acquire ret void } @@ -5864,7 +5864,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire ret void } @@ -6176,7 +6176,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire ret void } @@ -6488,7 +6488,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst ret void } @@ -6800,7 +6800,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst ret void } @@ -7112,7 +7112,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst ret void } @@ -7424,7 +7424,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr 
i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst ret void } @@ -7736,7 +7736,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst ret void } @@ -8035,7 +8035,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8352,7 +8352,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8679,7 +8679,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9022,7 +9022,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = 
getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9365,7 +9365,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9682,7 +9682,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9999,7 +9999,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10342,7 +10342,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10685,7 +10685,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: 
s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11028,7 +11028,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11371,7 +11371,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11714,7 +11714,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12057,7 +12057,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12400,7 
+12400,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12743,7 +12743,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16259,7 +16259,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic ret void } @@ -16541,7 +16541,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic ret void } @@ -16820,7 +16820,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic ret void } @@ -17128,7 +17128,7 @@ define amdgpu_kernel void 
@flat_cluster_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel monotonic ret void } @@ -17436,7 +17436,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic ret void } @@ -17718,7 +17718,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire ret void } @@ -18000,7 +18000,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire ret void } @@ -18308,7 +18308,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire ret void } @@ -18616,7 +18616,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = 
getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire ret void } @@ -18924,7 +18924,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire ret void } @@ -19232,7 +19232,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst ret void } @@ -19540,7 +19540,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst ret void } @@ -19848,7 +19848,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst ret void } @@ -20156,7 +20156,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst ret void } @@ -20464,7 +20464,7 @@ 
define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst ret void } @@ -20763,7 +20763,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21091,7 +21091,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21418,7 +21418,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21772,7 +21772,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") 
acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22126,7 +22126,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22454,7 +22454,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22782,7 +22782,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23136,7 +23136,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23490,7 +23490,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep 
= getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23844,7 +23844,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24198,7 +24198,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24552,7 +24552,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24906,7 +24906,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ 
-25260,7 +25260,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -25614,7 +25614,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("cluster-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index a05f4c718c351..7d357922ac307 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -3094,7 +3094,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic ret void } @@ -3347,7 +3347,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic ret void } @@ -3600,7 +3600,7 @@ define amdgpu_kernel void 
@flat_singlethread_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic ret void } @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic ret void } @@ -4106,7 +4106,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic ret void } @@ -4359,7 +4359,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire ret void } @@ -4612,7 +4612,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire ret void } @@ -4865,7 +4865,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, 
i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire ret void } @@ -5118,7 +5118,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire ret void } @@ -5371,7 +5371,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire ret void } @@ -5624,7 +5624,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst ret void } @@ -5877,7 +5877,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst ret void } @@ -6130,7 +6130,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst ret void } @@ -6383,7 +6383,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; 
GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst ret void } @@ -6636,7 +6636,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } @@ -6935,7 +6935,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7236,7 +7236,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7537,7 +7537,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7838,7 +7838,7 @@ define amdgpu_kernel void 
@flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8139,7 +8139,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8440,7 +8440,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8741,7 +8741,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9042,7 +9042,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("singlethread") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9343,7 +9343,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9644,7 +9644,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9945,7 +9945,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10246,7 +10246,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10547,7 +10547,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = 
getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10848,7 +10848,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11149,7 +11149,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -14237,7 +14237,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic ret void } @@ -14490,7 +14490,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic ret void } @@ -14743,7 +14743,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm 
ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic ret void } @@ -14996,7 +14996,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic ret void } @@ -15249,7 +15249,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic ret void } @@ -15502,7 +15502,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire ret void } @@ -15755,7 +15755,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire ret void } @@ -16008,7 +16008,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, 
ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire ret void } @@ -16261,7 +16261,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire ret void } @@ -16514,7 +16514,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire ret void } @@ -16767,7 +16767,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst ret void } @@ -17020,7 +17020,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst ret void } @@ -17273,7 +17273,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst ret void } @@ 
-17526,7 +17526,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst ret void } @@ -17779,7 +17779,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst ret void } @@ -18078,7 +18078,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18379,7 +18379,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18680,7 +18680,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release 
monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18981,7 +18981,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19282,7 +19282,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19583,7 +19583,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19884,7 +19884,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20185,7 +20185,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, 
i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20486,7 +20486,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20787,7 +20787,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21088,7 +21088,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21389,7 +21389,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst 
%val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21690,7 +21690,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21991,7 +21991,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22292,7 +22292,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 74065146fd385..d5b37650ae9cc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -3530,7 +3530,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 
%in monotonic monotonic ret void } @@ -3818,7 +3818,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic ret void } @@ -4102,7 +4102,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic ret void } @@ -4421,7 +4421,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic ret void } @@ -4740,7 +4740,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic ret void } @@ -5028,7 +5028,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire ret void } @@ -5316,7 +5316,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire ret 
void } @@ -5635,7 +5635,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire ret void } @@ -5954,7 +5954,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire ret void } @@ -6273,7 +6273,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire ret void } @@ -6592,7 +6592,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst ret void } @@ -6911,7 +6911,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst ret void } @@ -7230,7 +7230,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst ret void } @@ -7549,7 +7549,7 @@ define 
amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst ret void } @@ -7868,7 +7868,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst ret void } @@ -8167,7 +8167,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8486,7 +8486,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8818,7 +8818,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9168,7 +9168,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr 
i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9518,7 +9518,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9837,7 +9837,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10156,7 +10156,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10506,7 +10506,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10856,7 +10856,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr 
%out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11206,7 +11206,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11556,7 +11556,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11906,7 +11906,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12256,7 +12256,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12606,7 +12606,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + 
%gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12956,7 +12956,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16523,7 +16523,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic ret void } @@ -16807,7 +16807,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic ret void } @@ -17091,7 +17091,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic ret void } @@ -17406,7 +17406,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") 
acq_rel monotonic ret void } @@ -17721,7 +17721,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic ret void } @@ -18005,7 +18005,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire ret void } @@ -18289,7 +18289,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire ret void } @@ -18604,7 +18604,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire ret void } @@ -18919,7 +18919,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire ret void } @@ -19234,7 +19234,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr 
%out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire ret void } @@ -19549,7 +19549,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst ret void } @@ -19864,7 +19864,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst ret void } @@ -20179,7 +20179,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst ret void } @@ -20494,7 +20494,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst ret void } @@ -20809,7 +20809,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst ret void } @@ -21108,7 +21108,7 @@ define amdgpu_kernel void 
@flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21438,7 +21438,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21770,7 +21770,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22131,7 +22131,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22492,7 +22492,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") 
seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22822,7 +22822,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23152,7 +23152,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23513,7 +23513,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23874,7 +23874,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24235,7 +24235,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + 
%gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24596,7 +24596,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -24957,7 +24957,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -25318,7 +25318,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -25679,7 +25679,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -26040,7 +26040,7 @@ define amdgpu_kernel void 
@flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 8734e7152e281..b8e324ff5f458 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -3094,7 +3094,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic ret void } @@ -3347,7 +3347,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic ret void } @@ -3600,7 +3600,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic ret void } @@ -3853,7 +3853,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 
4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic ret void } @@ -4106,7 +4106,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic ret void } @@ -4359,7 +4359,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire ret void } @@ -4612,7 +4612,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire ret void } @@ -4865,7 +4865,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire ret void } @@ -5118,7 +5118,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire ret void } @@ -5371,7 +5371,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire ret void } @@ -5624,7 +5624,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst ret void } @@ -5877,7 +5877,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst ret void } @@ -6130,7 +6130,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst ret void } @@ -6383,7 +6383,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst ret void } @@ -6636,7 +6636,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } @@ -6935,7 
+6935,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7236,7 +7236,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7537,7 +7537,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7838,7 +7838,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8139,7 +8139,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("wavefront") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8440,7 +8440,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8741,7 +8741,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9042,7 +9042,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9343,7 +9343,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9644,7 +9644,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 
4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9945,7 +9945,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10246,7 +10246,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10547,7 +10547,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10848,7 +10848,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11149,7 +11149,7 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -14237,7 +14237,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic ret void } @@ -14490,7 +14490,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic ret void } @@ -14743,7 +14743,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic ret void } @@ -14996,7 +14996,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic ret void } @@ -15249,7 +15249,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic ret void } @@ -15502,7 +15502,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire ret void } @@ -15755,7 +15755,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire ret void } @@ -16008,7 +16008,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire ret void } @@ -16261,7 +16261,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire ret void } @@ -16514,7 +16514,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 
%old, i32 %in syncscope("wavefront-one-as") seq_cst acquire ret void } @@ -16767,7 +16767,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst ret void } @@ -17020,7 +17020,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst ret void } @@ -17273,7 +17273,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst ret void } @@ -17526,7 +17526,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst ret void } @@ -17779,7 +17779,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst ret void } @@ -18078,7 +18078,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18379,7 +18379,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18680,7 +18680,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18981,7 +18981,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19282,7 +19282,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg 
volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19583,7 +19583,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19884,7 +19884,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20185,7 +20185,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20486,7 +20486,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20787,7 +20787,7 @@ define amdgpu_kernel void 
@flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21088,7 +21088,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21389,7 +21389,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21690,7 +21690,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21991,7 +21991,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 
%old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index d384aec2a2b19..d44e7fff2359f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -3401,7 +3401,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic ret void } @@ -3674,7 +3674,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic ret void } @@ -3951,7 +3951,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic ret void } @@ -4250,7 +4250,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic ret void } @@ -4549,7 +4549,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 
%in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic ret void } @@ -4822,7 +4822,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire ret void } @@ -5095,7 +5095,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire ret void } @@ -5394,7 +5394,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire ret void } @@ -5693,7 +5693,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire ret void } @@ -5992,7 +5992,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire ret void } @@ -6291,7 
+6291,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst ret void } @@ -6590,7 +6590,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6903,7 +6903,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7228,7 +7228,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7562,7 +7562,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic %val0 = extractvalue { i32, i1 } 
%val, 0 store i32 %val0, ptr %out, align 4 @@ -7896,7 +7896,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8209,7 +8209,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8522,7 +8522,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8856,7 +8856,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9190,7 +9190,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg 
volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9524,7 +9524,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9861,7 +9861,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10198,7 +10198,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10532,7 +10532,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10866,7 +10866,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - 
%gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11200,7 +11200,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -14536,7 +14536,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic ret void } @@ -14800,7 +14800,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic ret void } @@ -15073,7 +15073,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic ret void } @@ -15360,7 +15360,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr 
%out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic ret void } @@ -15647,7 +15647,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic ret void } @@ -15911,7 +15911,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire ret void } @@ -16175,7 +16175,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire ret void } @@ -16462,7 +16462,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire ret void } @@ -16749,7 +16749,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel 
acquire ret void } @@ -17036,7 +17036,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire ret void } @@ -17320,7 +17320,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst ret void } @@ -17604,7 +17604,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst ret void } @@ -17891,7 +17891,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst ret void } @@ -18178,7 +18178,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst ret void } @@ -18465,7 +18465,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, 
i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst ret void } @@ -18764,7 +18764,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19073,7 +19073,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19394,7 +19394,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19726,7 +19726,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20058,7 
+20058,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20367,7 +20367,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20676,7 +20676,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21008,7 +21008,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21340,7 +21340,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 
4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21672,7 +21672,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22001,7 +22001,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22330,7 +22330,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22662,7 +22662,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22994,7 +22994,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23326,7 +23326,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll index 9713689217cf7..db82530f66aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/neg_ashr64_reduce.ll @@ -49,16 +49,18 @@ define <3 x i64> @v3_ashr_metadata(ptr %arg0.ptr, ptr %arg1.ptr) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; CHECK-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 +; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v4, v[0:1] offset:20 -; CHECK-NEXT: flat_load_dword v6, v[2:3] offset:16 +; CHECK-NEXT: flat_load_dword v4, v[2:3] offset:16 ; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] -; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; CHECK-NEXT: v_mov_b32_e32 v3, -1 +; CHECK-NEXT: flat_load_dword v1, v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v4, v6, v4 ; CHECK-NEXT: v_ashrrev_i32_e32 v0, v8, v5 ; CHECK-NEXT: v_ashrrev_i32_e32 v2, v10, v7 +; CHECK-NEXT: v_ashrrev_i32_e32 v4, v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, -1 ; 
CHECK-NEXT: v_mov_b32_e32 v5, -1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %val = load <3 x i64>, ptr %arg0.ptr, !range !4, !noundef !{} diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll index 7e4be65898b65..587e454da884c 100644 --- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -11,63 +11,73 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v ; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base ; CHECK-NEXT: s_movk_i32 s34, 0x80 ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v21, s35 +; CHECK-NEXT: s_add_nc_u64 s[44:45], s[34:35], 0x70 +; CHECK-NEXT: v_dual_mov_b32 v26, s34 :: v_dual_mov_b32 v27, s35 +; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 ; CHECK-NEXT: s_wait_kmcnt 0x0 ; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 ; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 ; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37 ; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39 +; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; CHECK-NEXT: s_add_nc_u64 s[24:25], s[34:35], 0x60 ; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29 ; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31 -; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 -; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 ; CHECK-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21 +; CHECK-NEXT: s_add_nc_u64 s[20:21], s[34:35], 0x50 +; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v23, s25 ; CHECK-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23 +; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20 ; 
CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:112 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:96 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] offset:80 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[24:25], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; CHECK-NEXT: s_add_nc_u64 s[12:13], s[34:35], 48 ; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 ; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 -; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 ; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12 ; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 ; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 ; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 ; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 ; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 ; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 -; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] offset:64 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[4:7] offset:48 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:32 
scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[8:11] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:16 scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[12:15] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] scope:SCOPE_SYS +; CHECK-NEXT: flat_store_b128 v[26:27], v[16:19] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:96 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:112 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:64 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:64 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:80 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:32 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:32 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:48 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 -; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:16 scope:SCOPE_SYS +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] offset:16 scope:SCOPE_SYS ; CHECK-NEXT: s_wait_loadcnt 0x0 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll 
b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index e50ed3ee95140..4d189d7a9673d 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -1492,7 +1492,7 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -16777215 + %gep = getelementptr inbounds i8, ptr %p, i64 -16777215 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1597,7 +1597,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589936639 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936639 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1702,7 +1702,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589936640 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936640 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1807,7 +1807,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589938687 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938687 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1903,7 +1903,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589938688 + %gep = getelementptr inbounds i8, ptr %p, i64 
8589938688 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2008,7 +2008,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589942783 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942783 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2104,7 +2104,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589942784 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942784 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2211,7 +2211,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773761 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2318,7 +2318,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773760 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2425,7 +2425,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771713 %load = load i8, ptr %gep, align 4 
ret i8 %load } @@ -2532,7 +2532,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771712 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2639,7 +2639,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767617 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -2746,7 +2746,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767616 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -4232,7 +4232,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589936639 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936639 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -4351,7 +4351,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589936640 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936640 %load = load volatile i8, ptr %gep, align 
1 store i8 %load, ptr poison ret void @@ -4470,7 +4470,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589938687 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938687 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -4590,7 +4590,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589938688 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938688 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -4710,7 +4710,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589942783 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942783 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -4830,7 +4830,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589942784 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942784 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -4955,7 +4955,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 + %gep = getelementptr inbounds i8, ptr %p, i64 
-9223372036854773761 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -5080,7 +5080,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773760 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -5205,7 +5205,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771713 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -5330,7 +5330,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771712 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -5455,7 +5455,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767617 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -5580,7 +5580,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: 
flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767616 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 4b03896043dbb..23468a128285d 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -304,62 +304,72 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: ; use s29 ; GFX906-NEXT: ;;#ASMEND ; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: 
buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v4, off, 
s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:108 ; 4-byte Folded 
Reload +; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 +; GFX906-NEXT: s_waitcnt vmcnt(33) +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x70, v2 +; GFX906-NEXT: s_waitcnt vmcnt(32) +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[32:35] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x60, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; 
GFX906-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX906-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX906-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX906-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 @@ -676,57 +686,67 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: ; use s29 ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; 
GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX908-NEXT: 
buffer_load_dword v32, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 
offset:100 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 s[4:5], exec +; GFX908-NEXT: s_waitcnt vmcnt(33) +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x70, v2 +; GFX908-NEXT: s_waitcnt vmcnt(32) +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[32:35] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x60, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[28:31] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v2 +; GFX908-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:64 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 +; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 48, v2 +; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:32 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:16 ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX908-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, 3 ; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:168 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..57eb25700c76a 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -2658,7 +2658,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX11-FAKE16-NEXT: s_endpgm entry: %null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr) - %gep = getelementptr i8, ptr %null, i64 -1 + %gep = getelementptr inbounds i8, ptr %null, i64 -1 %ld = load i8, ptr %gep %cmp = icmp eq i8 %ld, 0 br label %branch